From 63e3827c6bc5af9807b77e07fdcdae74b7d57161 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 15:38:08 +0100 Subject: [PATCH 01/32] Remove empty file. Likely it was added by accident. --- try.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 try.py diff --git a/try.py b/try.py deleted file mode 100644 index e69de29bb..000000000 -- GitLab From fa84ae26d62c7ac2ad6dca18b2d8b12ab83bc900 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 15:46:46 +0100 Subject: [PATCH 02/32] Reformat source code with black. This is the result of: $ black --line-length 119 examples templates transformers utils hubconf.py setup.py There's a lot of fairly long lines in the project. As a consequence, I'm picking the longest widely accepted line length, 119 characters. This is also Thomas' preference, because it allows for explicit variable names, to make the code easier to understand. --- examples/benchmarks.py | 169 +-- examples/contrib/run_camembert.py | 37 +- examples/contrib/run_openai_gpt.py | 167 +-- examples/contrib/run_swag.py | 456 ++++---- examples/contrib/run_transfo_xl.py | 92 +- examples/distillation/distiller.py | 348 ++++--- .../distillation/grouped_batch_sampler.py | 13 +- examples/distillation/lm_seqs_dataset.py | 32 +- .../distillation/run_squad_w_distillation.py | 666 +++++++----- .../distillation/scripts/binarized_data.py | 67 +- examples/distillation/scripts/extract.py | 94 +- .../scripts/extract_distilbert.py | 85 +- examples/distillation/scripts/token_counts.py | 34 +- examples/distillation/train.py | 320 +++--- examples/distillation/utils.py | 32 +- examples/mm-imdb/run_mmimdb.py | 419 +++++--- examples/mm-imdb/utils_mmimdb.py | 58 +- examples/pplm/pplm_classification_head.py | 1 + examples/pplm/run_pplm.py | 466 ++++----- examples/pplm/run_pplm_discrim_train.py | 293 ++---- examples/run_bertology.py | 252 +++-- examples/run_generation.py | 64 +- examples/run_glue.py | 481 +++++---- examples/run_lm_finetuning.py | 454 +++++--- examples/run_multiple_choice.py | 460 +++++--- examples/run_ner.py | 382 ++++--- examples/run_squad.py | 694 ++++++------ examples/run_tf_glue.py | 66 +- examples/run_tf_ner.py | 496 ++++----- examples/run_xnli.py | 446 +++++--- ...ert_bertabs_original_pytorch_checkpoint.py | 45 +- examples/summarization/modeling_bertabs.py | 218 +--- examples/summarization/run_summarization.py | 67 +- examples/summarization/utils_summarization.py | 12 +- .../summarization/utils_summarization_test.py | 24 +- examples/test_examples.py | 88 +- examples/utils_multiple_choice.py | 135 +-- examples/utils_ner.py | 60 +- hubconf.py | 11 +- setup.py | 47 +- .../adding_a_new_example_script/run_xxx.py | 583 +++++++---- .../adding_a_new_example_script/utils_xxx.py | 289 ++--- .../adding_a_new_model/configuration_xxx.py | 42 +- ...t_xxx_original_tf_checkpoint_to_pytorch.py | 36 +- .../adding_a_new_model/modeling_tf_xxx.py | 100 +- templates/adding_a_new_model/modeling_xxx.py | 216 ++-- .../tests/modeling_tf_xxx_test.py | 173 +-- .../tests/modeling_xxx_test.py | 166 +-- .../tests/tokenization_xxx_test.py | 30 +- .../adding_a_new_model/tokenization_xxx.py | 65 +- transformers/__init__.py | 441 +++++--- transformers/__main__.py | 13 +- transformers/commands/__init__.py | 1 + transformers/commands/convert.py | 121 ++- transformers/commands/download.py | 15 +- transformers/commands/run.py | 77 +- transformers/commands/serving.py | 83 +- transformers/commands/train.py | 146 +-- transformers/commands/user.py | 72 +- 
transformers/configuration_albert.py | 52 +- transformers/configuration_auto.py | 81 +- transformers/configuration_bert.py | 72 +- transformers/configuration_camembert.py | 5 +- transformers/configuration_ctrl.py | 4 +- transformers/configuration_distilbert.py | 45 +- transformers/configuration_gpt2.py | 16 +- transformers/configuration_mmbt.py | 4 +- transformers/configuration_openai.py | 4 +- transformers/configuration_roberta.py | 15 +- transformers/configuration_t5.py | 38 +- transformers/configuration_transfo_xl.py | 66 +- transformers/configuration_utils.py | 107 +- transformers/configuration_xlm.py | 89 +- transformers/configuration_xlm_roberta.py | 15 +- transformers/configuration_xlnet.py | 55 +- ...lbert_original_tf_checkpoint_to_pytorch.py | 36 +- ..._bert_original_tf_checkpoint_to_pytorch.py | 36 +- ..._bert_pytorch_checkpoint_to_original_tf.py | 66 +- ..._gpt2_original_tf_checkpoint_to_pytorch.py | 42 +- ...penai_original_tf_checkpoint_to_pytorch.py | 48 +- .../convert_pytorch_checkpoint_to_tf2.py | 463 +++++--- ..._original_pytorch_checkpoint_to_pytorch.py | 103 +- ...rt_t5_original_tf_checkpoint_to_pytorch.py | 36 +- ...fo_xl_original_tf_checkpoint_to_pytorch.py | 78 +- ..._original_pytorch_checkpoint_to_pytorch.py | 38 +- ...xlnet_original_tf_checkpoint_to_pytorch.py | 73 +- transformers/data/__init__.py | 9 +- transformers/data/metrics/__init__.py | 7 +- transformers/data/metrics/squad_metrics.py | 165 ++- transformers/data/processors/__init__.py | 2 +- transformers/data/processors/glue.py | 258 +++-- transformers/data/processors/squad.py | 107 +- transformers/data/processors/utils.py | 112 +- transformers/data/processors/xnli.py | 14 +- transformers/file_utils.py | 140 ++- transformers/hf_api.py | 46 +- transformers/modelcard.py | 80 +- transformers/modeling_albert.py | 206 ++-- transformers/modeling_auto.py | 306 ++++-- transformers/modeling_bert.py | 474 ++++++--- transformers/modeling_camembert.py | 49 +- transformers/modeling_ctrl.py | 121 ++- transformers/modeling_distilbert.py | 240 +++-- transformers/modeling_encoder_decoder.py | 25 +- transformers/modeling_gpt2.py | 173 +-- transformers/modeling_mmbt.py | 150 ++- transformers/modeling_openai.py | 147 ++- transformers/modeling_roberta.py | 207 ++-- transformers/modeling_t5.py | 260 +++-- transformers/modeling_tf_albert.py | 272 ++--- transformers/modeling_tf_auto.py | 268 +++-- transformers/modeling_tf_bert.py | 369 ++++--- transformers/modeling_tf_ctrl.py | 137 ++- transformers/modeling_tf_distilbert.py | 244 +++-- transformers/modeling_tf_gpt2.py | 174 ++-- transformers/modeling_tf_openai.py | 167 +-- transformers/modeling_tf_pytorch_utils.py | 107 +- transformers/modeling_tf_roberta.py | 102 +- transformers/modeling_tf_t5.py | 296 +++--- transformers/modeling_tf_transfo_xl.py | 305 +++--- .../modeling_tf_transfo_xl_utilities.py | 87 +- transformers/modeling_tf_utils.py | 179 ++-- transformers/modeling_tf_xlm.py | 236 +++-- transformers/modeling_tf_xlnet.py | 330 +++--- transformers/modeling_transfo_xl.py | 314 +++--- transformers/modeling_transfo_xl_utilities.py | 77 +- transformers/modeling_utils.py | 425 +++++--- transformers/modeling_xlm.py | 304 ++++-- transformers/modeling_xlm_roberta.py | 59 +- transformers/modeling_xlnet.py | 610 +++++++---- transformers/optimization.py | 60 +- transformers/optimization_tf.py | 313 +++--- transformers/pipelines.py | 470 +++++---- .../tests/configuration_common_test.py | 11 +- transformers/tests/hf_api_test.py | 28 +- transformers/tests/model_card_test.py | 64 +- 
transformers/tests/modeling_albert_test.py | 143 +-- transformers/tests/modeling_auto_test.py | 23 +- transformers/tests/modeling_bert_test.py | 310 ++++-- transformers/tests/modeling_common_test.py | 271 ++--- transformers/tests/modeling_ctrl_test.py | 112 +- .../tests/modeling_distilbert_test.py | 130 +-- .../tests/modeling_encoder_decoder_test.py | 6 +- transformers/tests/modeling_gpt2_test.py | 154 +-- transformers/tests/modeling_openai_test.py | 119 ++- transformers/tests/modeling_roberta_test.py | 185 ++-- transformers/tests/modeling_t5_test.py | 143 ++- transformers/tests/modeling_tf_albert_test.py | 153 ++- transformers/tests/modeling_tf_auto_test.py | 28 +- transformers/tests/modeling_tf_bert_test.py | 233 +++-- transformers/tests/modeling_tf_common_test.py | 90 +- transformers/tests/modeling_tf_ctrl_test.py | 105 +- .../tests/modeling_tf_distilbert_test.py | 128 +-- transformers/tests/modeling_tf_gpt2_test.py | 142 +-- .../tests/modeling_tf_openai_gpt_test.py | 144 +-- .../tests/modeling_tf_roberta_test.py | 166 ++- transformers/tests/modeling_tf_t5_test.py | 93 +- .../tests/modeling_tf_transfo_xl_test.py | 97 +- transformers/tests/modeling_tf_xlm_test.py | 251 +++-- transformers/tests/modeling_tf_xlnet_test.py | 281 +++-- .../tests/modeling_transfo_xl_test.py | 89 +- transformers/tests/modeling_xlm_test.py | 311 +++--- transformers/tests/modeling_xlnet_test.py | 357 +++++-- transformers/tests/optimization_test.py | 33 +- transformers/tests/optimization_tf_test.py | 10 +- transformers/tests/pipelines_test.py | 122 +-- .../tests/tokenization_albert_test.py | 35 +- transformers/tests/tokenization_auto_test.py | 1 + .../tests/tokenization_bert_japanese_test.py | 129 +-- transformers/tests/tokenization_bert_test.py | 104 +- transformers/tests/tokenization_ctrl_test.py | 20 +- .../tests/tokenization_distilbert_test.py | 10 +- transformers/tests/tokenization_gpt2_test.py | 41 +- .../tests/tokenization_openai_test.py | 41 +- .../tests/tokenization_roberta_test.py | 52 +- transformers/tests/tokenization_t5_test.py | 92 +- .../tests/tokenization_tests_commons.py | 177 ++-- .../tests/tokenization_transfo_xl_test.py | 38 +- transformers/tests/tokenization_utils_test.py | 3 +- transformers/tests/tokenization_xlm_test.py | 42 +- transformers/tests/tokenization_xlnet_test.py | 154 ++- transformers/tests/utils.py | 1 + transformers/tokenization_albert.py | 118 ++- transformers/tokenization_auto.py | 46 +- transformers/tokenization_bert.py | 190 ++-- transformers/tokenization_bert_japanese.py | 142 +-- transformers/tokenization_camembert.py | 60 +- transformers/tokenization_ctrl.py | 68 +- transformers/tokenization_distilbert.py | 21 +- transformers/tokenization_gpt2.py | 120 ++- transformers/tokenization_openai.py | 92 +- transformers/tokenization_roberta.py | 90 +- transformers/tokenization_t5.py | 79 +- transformers/tokenization_transfo_xl.py | 267 ++--- transformers/tokenization_utils.py | 448 ++++---- transformers/tokenization_xlm.py | 985 +++++++++--------- transformers/tokenization_xlm_roberta.py | 75 +- transformers/tokenization_xlnet.py | 102 +- utils/download_glue_data.py | 79 +- utils/link_tester.py | 2 +- 200 files changed, 17612 insertions(+), 12754 deletions(-) diff --git a/examples/benchmarks.py b/examples/benchmarks.py index 26c260b9e..20b62112b 100644 --- a/examples/benchmarks.py +++ b/examples/benchmarks.py @@ -247,16 +247,18 @@ the wall, slowly on into the Social Predestination Room. 
as they entered.""" -def create_setup_and_compute(model_names: List[str], - gpu: bool = True, - tensorflow: bool = False, - average_over: int = 3, - torchscript: bool = False, - xla: bool = False, - amp: bool = False, - fp16: bool = False, - save_to_csv: bool = False, - csv_filename: str = f"results_{round(time())}.csv"): +def create_setup_and_compute( + model_names: List[str], + gpu: bool = True, + tensorflow: bool = False, + average_over: int = 3, + torchscript: bool = False, + xla: bool = False, + amp: bool = False, + fp16: bool = False, + save_to_csv: bool = False, + csv_filename: str = f"results_{round(time())}.csv", +): if xla: tf.config.optimizer.set_jit(True) if amp: @@ -266,7 +268,7 @@ def create_setup_and_compute(model_names: List[str], dictionary = {model_name: {} for model_name in model_names} results = _compute_tensorflow(model_names, dictionary, average_over, amp) else: - device = 'cuda' if (gpu and torch.cuda.is_available()) else 'cpu' + device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu" dictionary = {model_name: {} for model_name in model_names} results = _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16) @@ -276,34 +278,52 @@ def create_setup_and_compute(model_names: List[str], for batch_size in results[model_name]["bs"]: print("\t\t" + f"===== BATCH SIZE: {batch_size} =====") for slice_size in results[model_name]["ss"]: - result = results[model_name]['results'][batch_size][slice_size] + result = results[model_name]["results"][batch_size][slice_size] if isinstance(result, str): - print(f"\t\t{model_name}/{batch_size}/{slice_size}: " - f"{result}") + print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result}") else: - print(f"\t\t{model_name}/{batch_size}/{slice_size}: " - f"{(round(1000 * result) / 1000)}" - f"s") + print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{(round(1000 * result) / 1000)}" f"s") if save_to_csv: - with open(csv_filename, mode='w') as csv_file: - fieldnames = ['model', - '1x8', '1x64', '1x128', '1x256', '1x512', '1x1024', - '2x8', '2x64', '2x128', '2x256', '2x512', '2x1024', - '4x8', '4x64', '4x128', '4x256', '4x512', '4x1024', - '8x8', '8x64', '8x128', '8x256', '8x512', '8x1024', - ] + with open(csv_filename, mode="w") as csv_file: + fieldnames = [ + "model", + "1x8", + "1x64", + "1x128", + "1x256", + "1x512", + "1x1024", + "2x8", + "2x64", + "2x128", + "2x256", + "2x512", + "2x1024", + "4x8", + "4x64", + "4x128", + "4x256", + "4x512", + "4x1024", + "8x8", + "8x64", + "8x128", + "8x256", + "8x512", + "8x1024", + ] writer = csv.DictWriter(csv_file, fieldnames=fieldnames) writer.writeheader() for model_name in model_names: model_results = { - f'{bs}x{ss}': results[model_name]['results'][bs][ss] + f"{bs}x{ss}": results[model_name]["results"][bs][ss] for bs in results[model_name]["results"] - for ss in results[model_name]['results'][bs] + for ss in results[model_name]["results"][bs] } - writer.writerow({'model': model_name, **model_results}) + writer.writerow({"model": model_name, **model_results}) def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16): @@ -343,7 +363,7 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, print("Going through model with sequence of shape", sequence.shape) runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3) - average_time = sum(runtimes)/float(len(runtimes)) / 3.0 + average_time = sum(runtimes) / float(len(runtimes)) / 3.0 
dictionary[model_name]["results"][batch_size][slice_size] = average_time except RuntimeError as e: print("Doesn't fit on GPU.", e) @@ -379,7 +399,9 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp): if max_input_size is not None and slice_size > max_input_size: dictionary[model_name]["results"][batch_size][slice_size] = "N/A" else: - sequence = tf.stack([tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size) + sequence = tf.stack( + [tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size + ) try: print("Going through model with sequence of shape", sequence.shape) @@ -387,7 +409,7 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp): inference(sequence) runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3) - average_time = sum(runtimes)/float(len(runtimes)) / 3.0 + average_time = sum(runtimes) / float(len(runtimes)) / 3.0 dictionary[model_name]["results"][batch_size][slice_size] = average_time except tf.errors.ResourceExhaustedError as e: print("Doesn't fit on GPU.", e) @@ -399,33 +421,64 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp): def main(): parser = argparse.ArgumentParser() - parser.add_argument("--models", required=False, type=str, default='all', help="Model checkpoints to be provided " - "to the AutoModel classes. Leave " - "blank to benchmark the base version " - "of all available model " - "architectures.") - parser.add_argument("--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " - "models") - parser.add_argument("--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available " - "cuda devices") - parser.add_argument("--torchscript", required=False, action="store_true", help="Pytorch only: trace the models " - "using torchscript") - parser.add_argument("--tensorflow", required=False, action="store_true", help="Benchmark the TensorFlow version " - "of the models. Will run on GPU if " - "the correct dependencies are " - "installed") + parser.add_argument( + "--models", + required=False, + type=str, + default="all", + help="Model checkpoints to be provided " + "to the AutoModel classes. Leave " + "blank to benchmark the base version " + "of all available model " + "architectures.", + ) + parser.add_argument( + "--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " "models" + ) + parser.add_argument( + "--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available " "cuda devices" + ) + parser.add_argument( + "--torchscript", + required=False, + action="store_true", + help="Pytorch only: trace the models " "using torchscript", + ) + parser.add_argument( + "--tensorflow", + required=False, + action="store_true", + help="Benchmark the TensorFlow version " + "of the models. 
Will run on GPU if " + "the correct dependencies are " + "installed", + ) parser.add_argument("--xla", required=False, action="store_true", help="TensorFlow only: use XLA acceleration.") - parser.add_argument("--amp", required=False, action="store_true", help="TensorFlow only: use automatic mixed precision acceleration.") - parser.add_argument("--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference.") - parser.add_argument("--keras_predict", required=False, action="store_true", help="Whether to use model.predict " - "instead of model() to do a " - "forward pass.") + parser.add_argument( + "--amp", + required=False, + action="store_true", + help="TensorFlow only: use automatic mixed precision acceleration.", + ) + parser.add_argument( + "--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference." + ) + parser.add_argument( + "--keras_predict", + required=False, + action="store_true", + help="Whether to use model.predict " "instead of model() to do a " "forward pass.", + ) parser.add_argument("--save_to_csv", required=False, action="store_true", help="Save to a CSV file.") - parser.add_argument("--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv.") - parser.add_argument("--average_over", required=False, default=30, type=int, help="Times an experiment will be run.") + parser.add_argument( + "--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv." + ) + parser.add_argument( + "--average_over", required=False, default=30, type=int, help="Times an experiment will be run." + ) args = parser.parse_args() - if args.models == 'all': + if args.models == "all": args.models = [ "gpt2", "bert-base-cased", @@ -436,7 +489,7 @@ def main(): "distilbert-base-uncased", "distilgpt2", "roberta-base", - "ctrl" + "ctrl", ] else: args.models = args.models.split() @@ -453,7 +506,7 @@ def main(): fp16=args.fp16, save_to_csv=args.save_to_csv, csv_filename=args.csv_filename, - average_over=args.average_over + average_over=args.average_over, ) else: raise ImportError("Trying to run a PyTorch benchmark but PyTorch was not found in the environment.") @@ -467,11 +520,11 @@ def main(): amp=args.amp, save_to_csv=args.save_to_csv, csv_filename=args.csv_filename, - average_over=args.average_over + average_over=args.average_over, ) else: raise ImportError("Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.") -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/examples/contrib/run_camembert.py b/examples/contrib/run_camembert.py index 28144d516..99f54f544 100644 --- a/examples/contrib/run_camembert.py +++ b/examples/contrib/run_camembert.py @@ -10,38 +10,37 @@ from transformers.modeling_camembert import CamembertForMaskedLM def fill_mask(masked_input, model, tokenizer, topk=5): # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py - assert masked_input.count('<mask>') == 1 input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0) # Batch size 1 logits = model(input_ids)[0] # The last hidden-state is the first element of the output tuple masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item() logits = logits[0, masked_index, :] prob = logits.softmax(dim=0) values, indices = prob.topk(k=topk, dim=0) - topk_predicted_token_bpe = '
'.join([tokenizer.convert_ids_to_tokens(indices[i].item()) - for i in range(len(indices))]) + topk_predicted_token_bpe = " ".join( + [tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))] + ) masked_token = tokenizer.mask_token topk_filled_outputs = [] - for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(' ')): - predicted_token = predicted_token_bpe.replace('\u2581', ' ') if " {0}".format(masked_token) in masked_input: - topk_filled_outputs.append(( - masked_input.replace( - ' {0}'.format(masked_token), predicted_token - ), - values[index].item(), - predicted_token, - )) + topk_filled_outputs.append( + ( + masked_input.replace(" {0}".format(masked_token), predicted_token), + values[index].item(), + predicted_token, + ) + ) else: - topk_filled_outputs.append(( - masked_input.replace(masked_token, predicted_token), - values[index].item(), - predicted_token, - )) + topk_filled_outputs.append( + (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,) + ) return topk_filled_outputs -tokenizer = CamembertTokenizer.from_pretrained('camembert-base') -model = CamembertForMaskedLM.from_pretrained('camembert-base') +tokenizer = CamembertTokenizer.from_pretrained("camembert-base") +model = CamembertForMaskedLM.from_pretrained("camembert-base") model.eval() masked_input = "Le camembert est <mask> :)" diff --git a/examples/contrib/run_openai_gpt.py b/examples/contrib/run_openai_gpt.py index bc5695bec..f6431c80b 100644 --- a/examples/contrib/run_openai_gpt.py +++ b/examples/contrib/run_openai_gpt.py @@ -36,34 +36,42 @@ from tqdm import tqdm, trange import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) - -from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, - AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME, - get_linear_schedule_with_warmup) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset + +from transformers import ( + OpenAIGPTDoubleHeadsModel, + OpenAIGPTTokenizer, + AdamW, + cached_path, + WEIGHTS_NAME, + CONFIG_NAME, + get_linear_schedule_with_warmup, +) ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz" -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO +) logger = logging.getLogger(__name__) + def accuracy(out, labels): outputs = np.argmax(out, axis=1) return np.sum(outputs == labels) + def load_rocstories_dataset(dataset_path): """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """ - with open(dataset_path, encoding='utf_8') as f: + with open(dataset_path, encoding="utf_8") as f: f = csv.reader(f) output = [] - next(f) # skip the first line + next(f) # skip the first line for line in tqdm(f): - output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1)) + output.append((" ".join(line[1:5]), line[5], line[6], int(line[-1]) - 1)) return output + def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token): """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation,
label) @@ -80,56 +88,68 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d for i, (story, cont1, cont2, mc_label), in enumerate(dataset): with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token] with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token] - input_ids[i, 0, :len(with_cont1)] = with_cont1 - input_ids[i, 1, :len(with_cont2)] = with_cont2 + input_ids[i, 0, : len(with_cont1)] = with_cont1 + input_ids[i, 1, : len(with_cont2)] = with_cont2 mc_token_ids[i, 0] = len(with_cont1) - 1 mc_token_ids[i, 1] = len(with_cont2) - 1 - lm_labels[i, 0, :len(with_cont1)] = with_cont1 - lm_labels[i, 1, :len(with_cont2)] = with_cont2 + lm_labels[i, 0, : len(with_cont1)] = with_cont1 + lm_labels[i, 1, : len(with_cont2)] = with_cont2 mc_labels[i] = mc_label all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels) tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs)) return tensor_datasets + def main(): parser = argparse.ArgumentParser() - parser.add_argument('--model_name', type=str, default='openai-gpt', - help='pretrained model name') - parser.add_argument("--do_train", action='store_true', help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") - parser.add_argument('--train_dataset', type=str, default='') - parser.add_argument('--eval_dataset', type=str, default='') - parser.add_argument('--seed', type=int, default=42) - parser.add_argument('--num_train_epochs', type=int, default=3) - parser.add_argument('--train_batch_size', type=int, default=8) - parser.add_argument('--eval_batch_size', type=int, default=16) - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument('--max_grad_norm', type=int, default=1) - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training \ - steps to perform. 
Override num_train_epochs.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before\ - performing a backward/update pass.") - parser.add_argument('--learning_rate', type=float, default=6.25e-5) - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - parser.add_argument('--lr_schedule', type=str, default='warmup_linear') - parser.add_argument('--weight_decay', type=float, default=0.01) - parser.add_argument('--lm_coef', type=float, default=0.9) - parser.add_argument('--n_valid', type=int, default=374) - - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument("--model_name", type=str, default="openai-gpt", help="pretrained model name") + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--train_dataset", type=str, default="") + parser.add_argument("--eval_dataset", type=str, default="") + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--num_train_epochs", type=int, default=3) + parser.add_argument("--train_batch_size", type=int, default=8) + parser.add_argument("--eval_batch_size", type=int, default=16) + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", type=int, default=1) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training \ + steps to perform. 
Override num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before\ + performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", type=float, default=6.25e-5) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + parser.add_argument("--lr_schedule", type=str, default="warmup_linear") + parser.add_argument("--weight_decay", type=float, default=0.01) + parser.add_argument("--lm_coef", type=float, default=0.9) + parser.add_argument("--n_valid", type=int, default=374) + + parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -152,7 +172,7 @@ def main(): # Load tokenizer and model # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned on the RocStories dataset - special_tokens = ['_start_', '_delimiter_', '_classify_'] + special_tokens = ["_start_", "_delimiter_", "_classify_"] tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name) tokenizer.add_tokens(special_tokens) special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens) @@ -163,6 +183,7 @@ def main(): # Load and encode the datasets if not args.train_dataset and not args.eval_dataset: roc_stories = cached_path(ROCSTORIES_URL) + def tokenize_and_encode(obj): """ Tokenize and encode a nested object """ if isinstance(obj, str): @@ -170,6 +191,7 @@ def main(): elif isinstance(obj, int): return obj return list(tokenize_and_encode(o) for o in obj) + logger.info("Encoding dataset...") train_dataset = load_rocstories_dataset(args.train_dataset) eval_dataset = load_rocstories_dataset(args.eval_dataset) @@ -178,8 +200,11 @@ def main(): # Compute the max input length for the Transformer max_length = model.config.n_positions // 2 - 2 - input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 \ - for dataset in encoded_datasets for story, cont1, cont2, _ in dataset) + input_length = max( + len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 + for dataset in encoded_datasets + for story, cont1, cont2, _ in dataset + ) input_length = min(input_length, model.config.n_positions) # Max size of input for the pre-trained model # Prepare inputs tensors and dataloaders @@ -198,20 +223,23 @@ def main(): if args.do_train: if args.max_steps > 0: t_total = args.max_steps - args.num_train_epochs = args.max_steps //\ - (len(train_dataloader) // args.gradient_accumulation_steps) + 1 + args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: - t_total = len(train_dataloader)\ - // args.gradient_accumulation_steps * args.num_train_epochs + t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.bias", 
"LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) if args.do_train: nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None @@ -230,14 +258,16 @@ def main(): optimizer.step() optimizer.zero_grad() tr_loss += loss.item() - exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item() + exp_average_loss = ( + loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item() + ) nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0]) # Save a trained model if args.do_train: # Save a trained model, configuration and tokenizer - model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself + model_to_save = model.module if hasattr(model, "module") else model # Only save the model itself # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) @@ -260,10 +290,12 @@ def main(): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels = batch with torch.no_grad(): - _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels) + _, mc_loss, _, mc_logits = model( + input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels + ) mc_logits = mc_logits.detach().cpu().numpy() - mc_labels = mc_labels.to('cpu').numpy() + mc_labels = mc_labels.to("cpu").numpy() tmp_eval_accuracy = accuracy(mc_logits, mc_labels) eval_loss += mc_loss.mean().item() @@ -274,10 +306,8 @@ def main(): eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples - train_loss = tr_loss/nb_tr_steps if args.do_train else None - result = {'eval_loss': eval_loss, - 'eval_accuracy': eval_accuracy, - 'train_loss': train_loss} + train_loss = tr_loss / nb_tr_steps if args.do_train else None + result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "train_loss": train_loss} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: @@ -286,5 +316,6 @@ def main(): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py index 5de93db7f..d03d1aace 100644 --- a/examples/contrib/run_swag.py +++ b/examples/contrib/run_swag.py @@ -28,8 +28,7 @@ import glob import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, 
TensorDataset from torch.utils.data.distributed import DistributedSampler try: @@ -39,31 +38,23 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, BertConfig, - BertForMultipleChoice, BertTokenizer) +from transformers import WEIGHTS_NAME, BertConfig, BertForMultipleChoice, BertTokenizer from transformers import AdamW, get_linear_schedule_with_warmup logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \ - for conf in [BertConfig]), ()) +ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in [BertConfig]), ()) MODEL_CLASSES = { - 'bert': (BertConfig, BertForMultipleChoice, BertTokenizer), + "bert": (BertConfig, BertForMultipleChoice, BertTokenizer), } + class SwagExample(object): """A single training/test example for the SWAG dataset.""" - def __init__(self, - swag_id, - context_sentence, - start_ending, - ending_0, - ending_1, - ending_2, - ending_3, - label = None): + + def __init__(self, swag_id, context_sentence, start_ending, ending_0, ending_1, ending_2, ending_3, label=None): self.swag_id = swag_id self.context_sentence = context_sentence self.start_ending = start_ending @@ -94,57 +85,49 @@ class SwagExample(object): return ", ".join(l) -class InputFeatures(object): - def __init__(self, - example_id, - choices_features, - label - ): +class InputFeatures(object): + def __init__(self, example_id, choices_features, label): self.example_id = example_id self.choices_features = [ - { - 'input_ids': input_ids, - 'input_mask': input_mask, - 'segment_ids': segment_ids - } + {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids} for _, input_ids, input_mask, segment_ids in choices_features ] self.label = label + def read_swag_examples(input_file, is_training=True): - with open(input_file, 'r', encoding='utf-8') as f: + with open(input_file, "r", encoding="utf-8") as f: reader = csv.reader(f) lines = [] for line in reader: if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) + line = list(unicode(cell, "utf-8") for cell in line) lines.append(line) - if is_training and lines[0][-1] != 'label': - raise ValueError( - "For training, the input file must contain a label column." - ) + if is_training and lines[0][-1] != "label": + raise ValueError("For training, the input file must contain a label column.") examples = [ SwagExample( - swag_id = line[2], - context_sentence = line[4], - start_ending = line[5], # in the swag dataset, the - # common beginning of each - # choice is stored in "sent2". - ending_0 = line[7], - ending_1 = line[8], - ending_2 = line[9], - ending_3 = line[10], - label = int(line[11]) if is_training else None - ) for line in lines[1:] # we skip the line with the column names + swag_id=line[2], + context_sentence=line[4], + start_ending=line[5], # in the swag dataset, the + # common beginning of each + # choice is stored in "sent2". + ending_0=line[7], + ending_1=line[8], + ending_2=line[9], + ending_3=line[10], + label=int(line[11]) if is_training else None, + ) + for line in lines[1:] # we skip the line with the column names ] return examples -def convert_examples_to_features(examples, tokenizer, max_seq_length, - is_training): + +def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training): """Loads a data file into a list of `InputBatch`s.""" # Swag is a multiple choice task. 
To perform this task using Bert, @@ -204,23 +187,18 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, logger.info("swag_id: {}".format(example.swag_id)) for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features): logger.info("choice: {}".format(choice_idx)) - logger.info("tokens: {}".format(' '.join(tokens))) - logger.info("input_ids: {}".format(' '.join(map(str, input_ids)))) - logger.info("input_mask: {}".format(' '.join(map(str, input_mask)))) - logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids)))) + logger.info("tokens: {}".format(" ".join(tokens))) + logger.info("input_ids: {}".format(" ".join(map(str, input_ids)))) + logger.info("input_mask: {}".format(" ".join(map(str, input_mask)))) + logger.info("segment_ids: {}".format(" ".join(map(str, segment_ids)))) if is_training: logger.info("label: {}".format(label)) - features.append( - InputFeatures( - example_id = example.swag_id, - choices_features = choices_features, - label = label - ) - ) + features.append(InputFeatures(example_id=example.swag_id, choices_features=choices_features, label=label)) return features + def _truncate_seq_pair(tokens_a, tokens_b, max_length): """Truncates a sequence pair in place to the maximum length.""" @@ -237,18 +215,14 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length): else: tokens_b.pop() + def accuracy(out, labels): outputs = np.argmax(out, axis=1) return np.sum(outputs == labels) + def select_field(features, field): - return [ - [ - choice[field] - for choice in feature.choices_features - ] - for feature in features - ] + return [[choice[field] for choice in feature.choices_features] for feature in features] def set_seed(args): @@ -258,24 +232,28 @@ def set_seed(args): if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) + def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False): if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache # Load data features from cache or dataset file input_file = args.predict_file if evaluate else args.train_file - cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length))) + cached_features_file = os.path.join( + os.path.dirname(input_file), + "cached_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + ), + ) if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", input_file) examples = read_swag_examples(input_file) - features = convert_examples_to_features( - examples, tokenizer, args.max_seq_length, not evaluate) + features = convert_examples_to_features(examples, tokenizer, args.max_seq_length, not evaluate) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) @@ -285,21 +263,21 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache # Convert to Tensors 
and build dataset - all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long) - all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long) - all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long) + all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long) + all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long) + all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long) all_label = torch.tensor([f.label for f in features], dtype=torch.long) if evaluate: - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_label) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) else: - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_label) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if output_examples: return dataset, examples, features return dataset + + def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: @@ -316,13 +294,18 @@ def train(args, train_dataset, model, tokenizer): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) if args.fp16: try: from apex import amp @@ -336,17 +319,21 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. 
parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -360,11 +347,13 @@ def train(args, train_dataset, model, tokenizer): for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - #'token_type_ids': None if args.model_type == 'xlm' else batch[2], - 'token_type_ids': batch[2], - 'labels': batch[3]} + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + #'token_type_ids': None if args.model_type == 'xlm' else batch[2], + "token_type_ids": batch[2], + "labels": batch[3], + } # if args.model_type in ['xlnet', 'xlm']: # inputs.update({'cls_index': batch[5], # 'p_mask': batch[6]}) @@ -372,7 +361,7 @@ def train(args, train_dataset, model, tokenizer): loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training + loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -393,23 +382,27 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + tb_writer.add_scalar("eval_{}".format(key), value, global_step) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_vocabulary(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -424,6 +417,7 @@ def train(args, train_dataset, model, tokenizer): return global_step, tr_loss / global_step + def evaluate(args, model, tokenizer, prefix=""): dataset, examples, features = load_and_cache_examples(args, 
tokenizer, evaluate=True, output_examples=True) @@ -440,7 +434,6 @@ def evaluate(args, model, tokenizer, prefix=""): logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) - eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 @@ -448,11 +441,13 @@ def evaluate(args, model, tokenizer, prefix=""): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - # 'token_type_ids': None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids - 'token_type_ids': batch[2], - 'labels': batch[3]} + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + # 'token_type_ids': None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids + "token_type_ids": batch[2], + "labels": batch[3], + } # if args.model_type in ['xlnet', 'xlm']: # inputs.update({'cls_index': batch[4], @@ -462,17 +457,16 @@ def evaluate(args, model, tokenizer, prefix=""): eval_loss += tmp_eval_loss.mean().item() logits = logits.detach().cpu().numpy() - label_ids = inputs['labels'].to('cpu').numpy() + label_ids = inputs["labels"].to("cpu").numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_accuracy += tmp_eval_accuracy nb_eval_steps += 1 - nb_eval_examples += inputs['input_ids'].size(0) + nb_eval_examples += inputs["input_ids"].size(0) eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples - result = {'eval_loss': eval_loss, - 'eval_accuracy': eval_accuracy} + result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: @@ -483,92 +477,144 @@ def evaluate(args, model, tokenizer, prefix=""): return result + def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--train_file", default=None, type=str, required=True, - help="SWAG csv for training. E.g., train.csv") - parser.add_argument("--predict_file", default=None, type=str, required=True, - help="SWAG csv for predictions. E.g., val.csv or test.csv") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model checkpoints and predictions will be written.") + parser.add_argument( + "--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv" + ) + parser.add_argument( + "--predict_file", + default=None, + type=str, + required=True, + help="SWAG csv for predictions. 
E.g., val.csv or test.csv", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints and predictions will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--max_seq_length", default=384, type=int, - help="The maximum total input sequence length after tokenization. Sequences " - "longer than this will be truncated, and sequences shorter than this will be padded.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") - - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Whether not to use CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") - - parser.add_argument("--local_rank", type=int, default=-1, - help="local_rank for distributed training on gpus") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--max_seq_length", + default=384, + type=int, + help="The maximum total input sequence length after tokenization. Sequences " + "longer than this will be truncated, and sequences shorter than this will be padded.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) + + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." 
+ ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") + + parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -580,16 +626,24 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -601,8 +655,12 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) - model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case + ) + model = model_class.from_pretrained( + args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -617,7 +675,6 @@ def main(): global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Save the trained model and the tokenizer if args.local_rank == -1 or torch.distributed.get_rank() == 0: # Create output directory if needed @@ -627,19 +684,20 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir) model.to(args.device) - # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory results = {} if args.do_eval and args.local_rank in [-1, 0]: @@ -650,14 +708,16 @@ def main(): checkpoints = [args.model_name_or_path] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: # Reload the model - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) tokenizer = tokenizer_class.from_pretrained(checkpoint) model.to(args.device) @@ -665,7 +725,7 @@ def main(): # Evaluate result = evaluate(args, model, tokenizer, prefix=global_step) - result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items()) + result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items()) results.update(result) logger.info("Results: {}".format(results)) diff --git a/examples/contrib/run_transfo_xl.py b/examples/contrib/run_transfo_xl.py index f5375269b..1ef66bef1 100644 --- a/examples/contrib/run_transfo_xl.py +++ b/examples/contrib/run_transfo_xl.py @@ -30,44 +30,36 @@ import torch from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO +) logger = logging.getLogger(__name__) + def main(): - parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') - parser.add_argument('--model_name', type=str, default='transfo-xl-wt103', - help='pretrained model name') - parser.add_argument('--split', type=str, default='test', - choices=['all', 'valid', 'test'], - help='which split to evaluate') - parser.add_argument('--batch_size', type=int, default=10, - help='batch size') - parser.add_argument('--tgt_len', type=int, default=128, - help='number of tokens to predict') - parser.add_argument('--ext_len', type=int, default=0, - help='length of the extended context') - parser.add_argument('--mem_len', type=int, default=1600, - help='length of the retained previous heads') - 
parser.add_argument('--clamp_len', type=int, default=1000, - help='max positional embedding index') - parser.add_argument('--no_cuda', action='store_true', - help='Do not use CUDA even though CUA is available') - parser.add_argument('--work_dir', type=str, required=True, - help='path to the work_dir') - parser.add_argument('--no_log', action='store_true', - help='do not log the eval result') - parser.add_argument('--same_length', action='store_true', - help='set same length attention with masking') - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + parser = argparse.ArgumentParser(description="PyTorch Transformer Language Model") + parser.add_argument("--model_name", type=str, default="transfo-xl-wt103", help="pretrained model name") + parser.add_argument( + "--split", type=str, default="test", choices=["all", "valid", "test"], help="which split to evaluate" + ) + parser.add_argument("--batch_size", type=int, default=10, help="batch size") + parser.add_argument("--tgt_len", type=int, default=128, help="number of tokens to predict") + parser.add_argument("--ext_len", type=int, default=0, help="length of the extended context") + parser.add_argument("--mem_len", type=int, default=1600, help="length of the retained previous heads") + parser.add_argument("--clamp_len", type=int, default=1000, help="max positional embedding index") + parser.add_argument("--no_cuda", action="store_true", help="Do not use CUDA even though CUA is available") + parser.add_argument("--work_dir", type=str, required=True, help="path to the work_dir") + parser.add_argument("--no_log", action="store_true", help="do not log the eval result") + parser.add_argument("--same_length", action="store_true", help="set same length attention with masking") + parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() - assert args.ext_len >= 0, 'extended context length must be non-negative' + assert args.ext_len >= 0, "extended context length must be non-negative" if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -84,17 +76,18 @@ def main(): corpus = TransfoXLCorpus.from_pretrained(args.model_name) ntokens = len(corpus.vocab) - va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len, - device=device, ext_len=args.ext_len) - te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len, - device=device, ext_len=args.ext_len) + va_iter = corpus.get_iterator("valid", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len) + te_iter = corpus.get_iterator("test", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len) # Load a pre-trained model model = TransfoXLLMHeadModel.from_pretrained(args.model_name) model = model.to(device) - logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format( - args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len)) + logger.info( + "Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".format( + args.batch_size, 
args.tgt_len, args.ext_len, args.mem_len, args.clamp_len + ) + ) model.reset_length(args.tgt_len, args.ext_len, args.mem_len) if args.clamp_len > 0: @@ -108,7 +101,7 @@ def main(): def evaluate(eval_iter): # Turn on evaluation mode which disables dropout. model.eval() - total_len, total_loss = 0, 0. + total_len, total_loss = 0, 0.0 start_time = time.time() with torch.no_grad(): mems = None @@ -119,35 +112,34 @@ def main(): total_loss += seq_len * loss.item() total_len += seq_len total_time = time.time() - start_time - logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format( - total_time, 1000 * total_time / (idx+1))) + logger.info("Time : {:.2f}s, {:.2f}ms/segment".format(total_time, 1000 * total_time / (idx + 1))) return total_loss / total_len # Run on test data. - if args.split == 'all': + if args.split == "all": test_loss = evaluate(te_iter) valid_loss = evaluate(va_iter) - elif args.split == 'valid': + elif args.split == "valid": valid_loss = evaluate(va_iter) test_loss = None - elif args.split == 'test': + elif args.split == "test": test_loss = evaluate(te_iter) valid_loss = None def format_log(loss, split): - log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format( - split, loss, math.exp(loss)) + log_str = "| {0} loss {1:5.2f} | {0} ppl {2:9.3f} ".format(split, loss, math.exp(loss)) return log_str - log_str = '' + log_str = "" if valid_loss is not None: - log_str += format_log(valid_loss, 'valid') + log_str += format_log(valid_loss, "valid") if test_loss is not None: - log_str += format_log(test_loss, 'test') + log_str += format_log(test_loss, "test") - logger.info('=' * 100) + logger.info("=" * 100) logger.info(log_str) - logger.info('=' * 100) + logger.info("=" * 100) + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index d5a86247a..e3bf0d443 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -40,14 +40,12 @@ from utils import logger from lm_seqs_dataset import LmSeqsDataset from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups + class Distiller: - def __init__(self, - params: dict, - dataset: LmSeqsDataset, - token_probs: torch.tensor, - student: nn.Module, - teacher: nn.Module): - logger.info('Initializing Distiller') + def __init__( + self, params: dict, dataset: LmSeqsDataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module + ): + logger.info("Initializing Distiller") self.params = params self.dump_path = params.dump_path self.multi_gpu = params.multi_gpu @@ -70,12 +68,10 @@ class Distiller: else: sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False) - self.dataloader = DataLoader(dataset=dataset, - batch_sampler=sampler, - collate_fn=dataset.batch_sequences) + self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences) self.temperature = params.temperature - assert self.temperature > 0. 
+ assert self.temperature > 0.0 self.alpha_ce = params.alpha_ce self.alpha_mlm = params.alpha_mlm @@ -85,18 +81,18 @@ class Distiller: self.mlm = params.mlm if self.mlm: - logger.info(f'Using MLM loss for LM step.') + logger.info(f"Using MLM loss for LM step.") self.mlm_mask_prop = params.mlm_mask_prop assert 0.0 <= self.mlm_mask_prop <= 1.0 assert params.word_mask + params.word_keep + params.word_rand == 1.0 self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand]) - self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs - self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs + self.pred_probs = self.pred_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else self.pred_probs + self.token_probs = token_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else token_probs if self.fp16: self.pred_probs = self.pred_probs.half() self.token_probs = self.token_probs.half() else: - logger.info(f'Using CLM loss for LM step.') + logger.info(f"Using CLM loss for LM step.") self.epoch = 0 self.n_iter = 0 @@ -107,38 +103,54 @@ class Distiller: self.last_loss_ce = 0 self.last_loss_mlm = 0 self.last_loss_clm = 0 - if self.alpha_mse > 0.: self.last_loss_mse = 0 - if self.alpha_cos > 0.: self.last_loss_cos = 0 + if self.alpha_mse > 0.0: + self.last_loss_mse = 0 + if self.alpha_cos > 0.0: + self.last_loss_cos = 0 self.last_log = 0 - self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean') + self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean") self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100) - if self.alpha_mse > 0.: - self.mse_loss_fct = nn.MSELoss(reduction='sum') - if self.alpha_cos > 0.: - self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction='mean') + if self.alpha_mse > 0.0: + self.mse_loss_fct = nn.MSELoss(reduction="sum") + if self.alpha_cos > 0.0: + self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean") - logger.info('--- Initializing model optimizer') + logger.info("--- Initializing model optimizer") assert params.gradient_accumulation_steps >= 1 self.num_steps_epoch = len(self.dataloader) - num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1 + num_train_optimization_steps = ( + int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1 + ) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': params.weight_decay}, - {'params': [p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': 0.0} + { + "params": [ + p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad + ], + "weight_decay": params.weight_decay, + }, + { + "params": [ + p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad + ], + "weight_decay": 0.0, + }, ] - logger.info("------ Number of trainable parameters (student): %i" % sum([p.numel() for p in self.student.parameters() if p.requires_grad])) + logger.info( + "------ Number of trainable parameters (student): %i" + % sum([p.numel() for p in self.student.parameters() if p.requires_grad]) + ) logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()])) - 
self.optimizer = AdamW(optimizer_grouped_parameters, - lr=params.learning_rate, - eps=params.adam_epsilon, - betas=(0.9, 0.98)) + self.optimizer = AdamW( + optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98) + ) warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop) - self.scheduler = get_linear_schedule_with_warmup(self.optimizer, - num_warmup_steps=warmup_steps, - num_training_steps=num_train_optimization_steps) + self.scheduler = get_linear_schedule_with_warmup( + self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps + ) if self.fp16: try: @@ -146,33 +158,36 @@ class Distiller: except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level") - self.student, self.optimizer = amp.initialize(self.student, - self.optimizer, - opt_level=self.params.fp16_opt_level) + self.student, self.optimizer = amp.initialize( + self.student, self.optimizer, opt_level=self.params.fp16_opt_level + ) self.teacher = self.teacher.half() if self.multi_gpu: if self.fp16: from apex.parallel import DistributedDataParallel + logger.info("Using apex.parallel.DistributedDataParallel for distributed training.") self.student = DistributedDataParallel(self.student) else: from torch.nn.parallel import DistributedDataParallel + logger.info("Using nn.parallel.DistributedDataParallel for distributed training.") - self.student = DistributedDataParallel(self.student, - device_ids=[params.local_rank], - output_device=params.local_rank, - find_unused_parameters=True) + self.student = DistributedDataParallel( + self.student, + device_ids=[params.local_rank], + output_device=params.local_rank, + find_unused_parameters=True, + ) self.is_master = params.is_master if self.is_master: - logger.info('--- Initializing Tensorboard') - self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, 'log', 'train')) - self.tensorboard.add_text(tag='config/training', text_string=str(self.params), global_step=0) - self.tensorboard.add_text(tag='config/student', text_string=str(self.student_config), global_step=0) + logger.info("--- Initializing Tensorboard") + self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, "log", "train")) + self.tensorboard.add_text(tag="config/training", text_string=str(self.params), global_step=0) + self.tensorboard.add_text(tag="config/student", text_string=str(self.student_config), global_step=0) - def prepare_batch_mlm(self, - batch): + def prepare_batch_mlm(self, batch): """ Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM. 
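To make the reformatted prepare_batch_mlm easier to follow, here is a minimal standalone sketch of the same idea: build the attention mask from the lengths, pick positions to predict (never padding), and set every non-predicted label to -100 so the loss ignores it. The helper name, the pad/mask token ids and the flat 15% masking rate are illustrative assumptions; the Distiller above additionally splits masked positions into mask/keep/random according to pred_probs and weights the sampling by token_probs.

import torch

def make_mlm_batch(token_ids, lengths, pad_id=0, mask_id=103, mlm_prob=0.15):
    """token_ids: LongTensor (bs, max_seq_len) padded with pad_id; lengths: LongTensor (bs,)."""
    bs, max_seq_len = token_ids.size()
    # A position is real (not padding) when its index is below the sequence length.
    attn_mask = torch.arange(max_seq_len, device=lengths.device) < lengths[:, None]

    mlm_labels = token_ids.clone()
    # Sample prediction targets uniformly, but never on padding positions.
    pred_mask = (torch.rand(bs, max_seq_len, device=token_ids.device) < mlm_prob) & attn_mask
    mlm_labels[~pred_mask] = -100  # ignored by nn.CrossEntropyLoss(ignore_index=-100)
    # Simplification: always replace selected targets with the mask token.
    token_ids = token_ids.masked_fill(pred_mask, mask_id)
    return token_ids, attn_mask, mlm_labels

# Tiny usage example (the ids are made up):
tokens = torch.tensor([[101, 7592, 2088, 102, 0, 0]])
ids, mask, labels = make_mlm_batch(tokens, torch.tensor([4]))
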
@@ -192,7 +207,7 @@ class Distiller: token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths) assert token_ids.size(0) == lengths.size(0) - attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]) + attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None] bs, max_seq_len = token_ids.size() mlm_labels = token_ids.new(token_ids.size()).copy_(token_ids) @@ -200,11 +215,13 @@ class Distiller: x_prob = self.token_probs[token_ids.flatten()] n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item()) tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False) - pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.bool, device=token_ids.device) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility + pred_mask = torch.zeros( + bs * max_seq_len, dtype=torch.bool, device=token_ids.device + ) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility pred_mask[tgt_ids] = 1 pred_mask = pred_mask.view(bs, max_seq_len) - pred_mask[token_ids == self.params.special_tok_ids['pad_token']] = 0 + pred_mask[token_ids == self.params.special_tok_ids["pad_token"]] = 0 # mask a number of words == 0 [8] (faster with fp16) if self.fp16: @@ -213,26 +230,29 @@ class Distiller: pred_mask = pred_mask.view(-1) n2 = max(n1 % 8, 8 * (n1 // 8)) if n2 != n1: - pred_mask[torch.nonzero(pred_mask).view(-1)[:n1-n2]] = 0 + pred_mask[torch.nonzero(pred_mask).view(-1)[: n1 - n2]] = 0 pred_mask = pred_mask.view(bs, max_seq_len) assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item() _token_ids_real = token_ids[pred_mask] _token_ids_rand = _token_ids_real.clone().random_(self.vocab_size) - _token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids['mask_token']) + _token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids["mask_token"]) probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True) - _token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long() + _token_ids = ( + _token_ids_mask * (probs == 0).long() + + _token_ids_real * (probs == 1).long() + + _token_ids_rand * (probs == 2).long() + ) token_ids = token_ids.masked_scatter(pred_mask, _token_ids) - mlm_labels[~pred_mask] = -100 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility + mlm_labels[~pred_mask] = -100 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility # sanity checks assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size return token_ids, attn_mask, mlm_labels - def prepare_batch_clm(self, - batch): + def prepare_batch_clm(self, batch): """ Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the labels for CLM. 
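The CLM variant above is simpler: the labels are the token ids themselves, and padded positions are pushed to -100 through the attention mask so the language-modeling loss skips them. A standalone sketch of that logic (the function name is only for illustration):

import torch

def make_clm_batch(token_ids, lengths):
    # True where the position holds a real token, False on padding.
    attn_mask = torch.arange(token_ids.size(1), device=lengths.device) < lengths[:, None]
    clm_labels = token_ids.clone()
    clm_labels[~attn_mask] = -100  # padding positions are ignored by the loss
    return token_ids, attn_mask, clm_labels
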
@@ -252,18 +272,16 @@ class Distiller: token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths) assert token_ids.size(0) == lengths.size(0) - attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]) + attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None] clm_labels = token_ids.new(token_ids.size()).copy_(token_ids) - clm_labels[~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility + clm_labels[~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility # sanity checks assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size return token_ids, attn_mask, clm_labels - def round_batch(self, - x: torch.tensor, - lengths: torch.tensor): + def round_batch(self, x: torch.tensor, lengths: torch.tensor): """ For float16 only. Sub-sample sentences in a batch, and add padding, so that each dimension is a multiple of 8. @@ -299,9 +317,9 @@ class Distiller: pad = 8 - (ml1 % 8) ml2 = ml1 + pad if self.mlm: - pad_id = self.params.special_tok_ids['pad_token'] + pad_id = self.params.special_tok_ids["pad_token"] else: - pad_id = self.params.special_tok_ids['unk_token'] + pad_id = self.params.special_tok_ids["unk_token"] padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id) x = torch.cat([x, padding_tensor], 1) assert x.size() == (bs2, ml2) @@ -314,20 +332,22 @@ class Distiller: """ The real training loop. """ - if self.is_master: logger.info('Starting training') + if self.is_master: + logger.info("Starting training") self.last_log = time.time() self.student.train() self.teacher.eval() for _ in range(self.params.n_epoch): - if self.is_master: logger.info(f'--- Starting epoch {self.epoch}/{self.params.n_epoch-1}') + if self.is_master: + logger.info(f"--- Starting epoch {self.epoch}/{self.params.n_epoch-1}") if self.multi_gpu: torch.distributed.barrier() iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0]) for batch in iter_bar: if self.params.n_gpu > 0: - batch = tuple(t.to(f'cuda:{self.params.local_rank}') for t in batch) + batch = tuple(t.to(f"cuda:{self.params.local_rank}") for t in batch) if self.mlm: token_ids, attn_mask, lm_labels = self.prepare_batch_mlm(batch=batch) @@ -336,22 +356,21 @@ class Distiller: self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels) iter_bar.update() - iter_bar.set_postfix({'Last_loss': f'{self.last_loss:.2f}', - 'Avg_cum_loss': f'{self.total_loss_epoch/self.n_iter:.2f}'}) + iter_bar.set_postfix( + {"Last_loss": f"{self.last_loss:.2f}", "Avg_cum_loss": f"{self.total_loss_epoch/self.n_iter:.2f}"} + ) iter_bar.close() - if self.is_master: logger.info(f'--- Ending epoch {self.epoch}/{self.params.n_epoch-1}') + if self.is_master: + logger.info(f"--- Ending epoch {self.epoch}/{self.params.n_epoch-1}") self.end_epoch() if self.is_master: - logger.info(f'Save very last checkpoint as `pytorch_model.bin`.') - self.save_checkpoint(checkpoint_name=f'pytorch_model.bin') - logger.info('Training is finished') - - def step(self, - input_ids: torch.tensor, - attention_mask: torch.tensor, - lm_labels: torch.tensor): + logger.info(f"Save very last checkpoint as `pytorch_model.bin`.") + self.save_checkpoint(checkpoint_name=f"pytorch_model.bin") + logger.info("Training is finished") + + def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor): """ One optimization step: 
forward of student AND teacher, backward on the loss (for gradient accumulation), and possibly a parameter update (depending on the gradient accumulation). @@ -363,78 +382,91 @@ class Distiller: lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM). """ if self.mlm: - s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size) + s_logits, s_hidden_states = self.student( + input_ids=input_ids, attention_mask=attention_mask + ) # (bs, seq_length, voc_size) with torch.no_grad(): - t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size) + t_logits, t_hidden_states = self.teacher( + input_ids=input_ids, attention_mask=attention_mask + ) # (bs, seq_length, voc_size) else: - s_logits, _, s_hidden_states = self.student(input_ids=input_ids, attention_mask=None) # (bs, seq_length, voc_size) + s_logits, _, s_hidden_states = self.student( + input_ids=input_ids, attention_mask=None + ) # (bs, seq_length, voc_size) with torch.no_grad(): - t_logits, _, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=None) # (bs, seq_length, voc_size) + t_logits, _, t_hidden_states = self.teacher( + input_ids=input_ids, attention_mask=None + ) # (bs, seq_length, voc_size) assert s_logits.size() == t_logits.size() - #https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100 - #https://github.com/peterliht/knowledge-distillation-pytorch/issues/2 + # https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100 + # https://github.com/peterliht/knowledge-distillation-pytorch/issues/2 if self.params.restrict_ce_to_mask: - mask = (lm_labels>-1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size) + mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size) else: - mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size) - s_logits_slct = torch.masked_select(s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask - s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask - t_logits_slct = torch.masked_select(t_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask - t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask + mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size) + s_logits_slct = torch.masked_select(s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask + s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask + t_logits_slct = torch.masked_select(t_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask + t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask assert t_logits_slct.size() == s_logits_slct.size() - loss_ce = self.ce_loss_fct(F.log_softmax(s_logits_slct/self.temperature, dim=-1), - F.softmax(t_logits_slct/self.temperature, dim=-1)) * (self.temperature)**2 - loss = self.alpha_ce*loss_ce + loss_ce = ( + self.ce_loss_fct( + F.log_softmax(s_logits_slct / self.temperature, dim=-1), + F.softmax(t_logits_slct / self.temperature, dim=-1), + ) + * (self.temperature) ** 2 + ) + loss = self.alpha_ce * loss_ce - if self.alpha_mlm > 0.: + if self.alpha_mlm > 0.0: loss_mlm = 
self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1)) loss += self.alpha_mlm * loss_mlm - if self.alpha_clm > 0.: + if self.alpha_clm > 0.0: shift_logits = s_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() - loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1)) + loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) loss += self.alpha_clm * loss_clm - if self.alpha_mse > 0.: - loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction + if self.alpha_mse > 0.0: + loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct) / s_logits_slct.size( + 0 + ) # Reproducing batchmean reduction loss += self.alpha_mse * loss_mse - if self.alpha_cos > 0.: - s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim) - t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim) - mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states) # (bs, seq_length, dim) + if self.alpha_cos > 0.0: + s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim) + t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim) + mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states) # (bs, seq_length, dim) assert s_hidden_states.size() == t_hidden_states.size() dim = s_hidden_states.size(-1) - - s_hidden_states_slct = torch.masked_select(s_hidden_states, mask) # (bs * seq_length * dim) - s_hidden_states_slct = s_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim) - t_hidden_states_slct = torch.masked_select(t_hidden_states, mask) # (bs * seq_length * dim) - t_hidden_states_slct = t_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim) - - target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,) + + s_hidden_states_slct = torch.masked_select(s_hidden_states, mask) # (bs * seq_length * dim) + s_hidden_states_slct = s_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim) + t_hidden_states_slct = torch.masked_select(t_hidden_states, mask) # (bs * seq_length * dim) + t_hidden_states_slct = t_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim) + + target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,) loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target) loss += self.alpha_cos * loss_cos self.total_loss_epoch += loss.item() self.last_loss = loss.item() self.last_loss_ce = loss_ce.item() - if self.alpha_mlm > 0.: + if self.alpha_mlm > 0.0: self.last_loss_mlm = loss_mlm.item() - if self.alpha_clm > 0.: + if self.alpha_clm > 0.0: self.last_loss_clm = loss_clm.item() - if self.alpha_mse > 0.: + if self.alpha_mse > 0.0: self.last_loss_mse = loss_mse.item() - if self.alpha_cos > 0.: + if self.alpha_cos > 0.0: self.last_loss_cos = loss_cos.item() self.optimize(loss) self.n_sequences_epoch += input_ids.size(0) - def optimize(self, - loss): + def optimize(self, loss): """ Normalization on the loss (gradient accumulation or distributed training), followed by backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation). 
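The optimize docstring above describes the usual gradient-accumulation pattern: scale the loss, backpropagate on every batch, and only clip, step and zero the gradients once per accumulation window. A minimal sketch of that pattern, with the object names, the accumulation window and max_grad_norm chosen here purely for illustration:

import torch

def accumulate_and_step(loss, model, optimizer, scheduler, step, accum_steps=4, max_grad_norm=1.0):
    # Normalize so the accumulated gradient matches a single large-batch update.
    (loss / accum_steps).backward()
    if (step + 1) % accum_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()      # parameter update
        scheduler.step()      # learning-rate schedule advances with the updates
        optimizer.zero_grad()
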
@@ -442,7 +474,7 @@ class Distiller: """ # Check for NaN if (loss != loss).data.any(): - logger.error('NaN detected') + logger.error("NaN detected") exit() if self.multi_gpu: @@ -452,6 +484,7 @@ class Distiller: if self.fp16: from apex import amp + with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: @@ -488,53 +521,84 @@ class Distiller: return for param_name, param in self.student.named_parameters(): - self.tensorboard.add_scalar(tag='parameter_mean/' + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter) - self.tensorboard.add_scalar(tag='parameter_std/' + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter) + self.tensorboard.add_scalar( + tag="parameter_mean/" + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter + ) + self.tensorboard.add_scalar( + tag="parameter_std/" + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter + ) if param.grad is None: continue - self.tensorboard.add_scalar(tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(),global_step=self.n_total_iter) - self.tensorboard.add_scalar(tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter) - - self.tensorboard.add_scalar(tag="losses/cum_avg_loss_epoch", scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.n_total_iter) + self.tensorboard.add_scalar( + tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(), global_step=self.n_total_iter + ) + self.tensorboard.add_scalar( + tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter + ) + + self.tensorboard.add_scalar( + tag="losses/cum_avg_loss_epoch", + scalar_value=self.total_loss_epoch / self.n_iter, + global_step=self.n_total_iter, + ) self.tensorboard.add_scalar(tag="losses/loss", scalar_value=self.last_loss, global_step=self.n_total_iter) - self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter) - if self.alpha_mlm > 0.: - self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter) - if self.alpha_clm > 0.: - self.tensorboard.add_scalar(tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter) - if self.alpha_mse > 0.: - self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter) - if self.alpha_cos > 0.: - self.tensorboard.add_scalar(tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter) - self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter) - - self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter) - self.tensorboard.add_scalar(tag="global/speed", scalar_value=time.time()-self.last_log, global_step=self.n_total_iter) + self.tensorboard.add_scalar( + tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter + ) + if self.alpha_mlm > 0.0: + self.tensorboard.add_scalar( + tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter + ) + if self.alpha_clm > 0.0: + self.tensorboard.add_scalar( + tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter + ) + if self.alpha_mse > 0.0: + self.tensorboard.add_scalar( + tag="losses/loss_mse", scalar_value=self.last_loss_mse, 
global_step=self.n_total_iter + ) + if self.alpha_cos > 0.0: + self.tensorboard.add_scalar( + tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter + ) + self.tensorboard.add_scalar( + tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter + ) + + self.tensorboard.add_scalar( + tag="global/memory_usage", + scalar_value=psutil.virtual_memory()._asdict()["used"] / 1_000_000, + global_step=self.n_total_iter, + ) + self.tensorboard.add_scalar( + tag="global/speed", scalar_value=time.time() - self.last_log, global_step=self.n_total_iter + ) def end_epoch(self): """ Finally arrived at the end of epoch (full pass on dataset). Do some tensorboard logging and checkpoint saving. """ - logger.info(f'{self.n_sequences_epoch} sequences have been trained during this epoch.') + logger.info(f"{self.n_sequences_epoch} sequences have been trained during this epoch.") if self.is_master: - self.save_checkpoint(checkpoint_name=f'model_epoch_{self.epoch}.pth') - self.tensorboard.add_scalar(tag='epoch/loss', scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.epoch) + self.save_checkpoint(checkpoint_name=f"model_epoch_{self.epoch}.pth") + self.tensorboard.add_scalar( + tag="epoch/loss", scalar_value=self.total_loss_epoch / self.n_iter, global_step=self.epoch + ) self.epoch += 1 self.n_sequences_epoch = 0 self.n_iter = 0 self.total_loss_epoch = 0 - def save_checkpoint(self, - checkpoint_name: str = 'checkpoint.pth'): + def save_checkpoint(self, checkpoint_name: str = "checkpoint.pth"): """ Save the current state. Only by the master process. """ if not self.is_master: return - mdl_to_save = self.student.module if hasattr(self.student, 'module') else self.student + mdl_to_save = self.student.module if hasattr(self.student, "module") else self.student mdl_to_save.config.save_pretrained(self.dump_path) state_dict = mdl_to_save.state_dict() torch.save(state_dict, os.path.join(self.dump_path, checkpoint_name)) diff --git a/examples/distillation/grouped_batch_sampler.py b/examples/distillation/grouped_batch_sampler.py index 46d943a3d..1132fdb58 100644 --- a/examples/distillation/grouped_batch_sampler.py +++ b/examples/distillation/grouped_batch_sampler.py @@ -23,12 +23,14 @@ from torch.utils.data.sampler import BatchSampler, Sampler from utils import logger + def _quantize(x, bins): bins = copy.deepcopy(bins) bins = sorted(bins) quantized = list(map(lambda y: bisect.bisect_right(bins, y), x)) return quantized + def create_lengths_groups(lengths, k=0): bins = np.arange(start=3, stop=k, step=4).tolist() if k > 0 else [10] groups = _quantize(lengths, bins) @@ -39,6 +41,7 @@ def create_lengths_groups(lengths, k=0): logger.info("Count of instances per bin: {}".format(counts)) return groups + class GroupedBatchSampler(BatchSampler): """ Wraps another sampler to yield a mini-batch of indices. @@ -53,11 +56,11 @@ class GroupedBatchSampler(BatchSampler): 0, i.e. they must be in the range [0, num_groups). batch_size (int): Size of mini-batch. 
""" + def __init__(self, sampler, group_ids, batch_size): if not isinstance(sampler, Sampler): raise ValueError( - "sampler should be an instance of " - "torch.utils.data.Sampler, but got sampler={}".format(sampler) + "sampler should be an instance of " "torch.utils.data.Sampler, but got sampler={}".format(sampler) ) self.sampler = sampler self.group_ids = group_ids @@ -73,7 +76,7 @@ class GroupedBatchSampler(BatchSampler): buffer_per_group[group_id].append(idx) samples_per_group[group_id].append(idx) if len(buffer_per_group[group_id]) == self.batch_size: - yield buffer_per_group[group_id] #TODO + yield buffer_per_group[group_id] # TODO num_batches += 1 del buffer_per_group[group_id] assert len(buffer_per_group[group_id]) < self.batch_size @@ -90,8 +93,8 @@ class GroupedBatchSampler(BatchSampler): for group_id, idxs in sorted(buffer_per_group.items(), key=lambda x: x[0]): batch_idx.extend(idxs) if len(batch_idx) >= self.batch_size: - yield batch_idx[:self.batch_size] - batch_idx = batch_idx[self.batch_size:] + yield batch_idx[: self.batch_size] + batch_idx = batch_idx[self.batch_size :] num_remaining -= 1 if len(batch_idx) > 0: yield batch_idx diff --git a/examples/distillation/lm_seqs_dataset.py b/examples/distillation/lm_seqs_dataset.py index 54e9742ce..bb0d80f38 100644 --- a/examples/distillation/lm_seqs_dataset.py +++ b/examples/distillation/lm_seqs_dataset.py @@ -21,6 +21,7 @@ from torch.utils.data import Dataset import numpy as np from utils import logger + class LmSeqsDataset(Dataset): """Custom Dataset wrapping language modeling sequences. @@ -32,9 +33,7 @@ class LmSeqsDataset(Dataset): data: `List[np.array[int]] """ - def __init__(self, - params, - data): + def __init__(self, params, data): self.params = params self.token_ids = np.array(data) @@ -57,7 +56,7 @@ class LmSeqsDataset(Dataset): Some sanity checks """ assert len(self.token_ids) == len(self.lengths) - assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths))) + assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths))) def remove_long_sequences(self): """ @@ -65,17 +64,17 @@ class LmSeqsDataset(Dataset): """ max_len = self.params.max_model_input_size indices = self.lengths > max_len - logger.info(f'Splitting {sum(indices)} too long sequences.') + logger.info(f"Splitting {sum(indices)} too long sequences.") def divide_chunks(l, n): - return [l[i:i + n] for i in range(0, len(l), n)] + return [l[i : i + n] for i in range(0, len(l), n)] new_tok_ids = [] new_lengths = [] if self.params.mlm: - cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token'] + cls_id, sep_id = self.params.special_tok_ids["cls_token"], self.params.special_tok_ids["sep_token"] else: - cls_id, sep_id = self.params.special_tok_ids['bos_token'], self.params.special_tok_ids['eos_token'] + cls_id, sep_id = self.params.special_tok_ids["bos_token"], self.params.special_tok_ids["eos_token"] for seq_, len_ in zip(self.token_ids, self.lengths): assert (seq_[0] == cls_id) and (seq_[-1] == sep_id), seq_ @@ -84,7 +83,7 @@ class LmSeqsDataset(Dataset): new_lengths.append(len_) else: sub_seqs = [] - for sub_s in divide_chunks(seq_, max_len-2): + for sub_s in divide_chunks(seq_, max_len - 2): if sub_s[0] != cls_id: sub_s = np.insert(sub_s, 0, cls_id) if sub_s[-1] != sep_id: @@ -108,7 +107,7 @@ class LmSeqsDataset(Dataset): self.token_ids = self.token_ids[indices] self.lengths = self.lengths[indices] new_size = len(self) - logger.info(f'Remove {init_size - new_size} too 
short (<=11 tokens) sequences.') + logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.") def print_statistics(self): """ @@ -116,7 +115,7 @@ class LmSeqsDataset(Dataset): """ if not self.params.is_master: return - logger.info(f'{len(self)} sequences') + logger.info(f"{len(self)} sequences") # data_len = sum(self.lengths) # nb_unique_tokens = len(Counter(list(chain(*self.token_ids)))) # logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)') @@ -125,8 +124,7 @@ class LmSeqsDataset(Dataset): # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids]) # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)') - def batch_sequences(self, - batch): + def batch_sequences(self, batch): """ Do the padding and transform into torch.tensor. """ @@ -139,13 +137,13 @@ class LmSeqsDataset(Dataset): # Pad token ids if self.params.mlm: - pad_idx = self.params.special_tok_ids['pad_token'] + pad_idx = self.params.special_tok_ids["pad_token"] else: - pad_idx = self.params.special_tok_ids['unk_token'] - tk_ = [list(t.astype(int)) + [pad_idx]*(max_seq_len_-len(t)) for t in token_ids] + pad_idx = self.params.special_tok_ids["unk_token"] + tk_ = [list(t.astype(int)) + [pad_idx] * (max_seq_len_ - len(t)) for t in token_ids] assert len(tk_) == len(token_ids) assert all(len(t) == max_seq_len_ for t in tk_) - tk_t = torch.tensor(tk_) # (bs, max_seq_len_) + tk_t = torch.tensor(tk_) # (bs, max_seq_len_) lg_t = torch.tensor(lengths) # (bs) return tk_t, lg_t diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py index 70b65dc1b..0d5a004eb 100644 --- a/examples/distillation/run_squad_w_distillation.py +++ b/examples/distillation/run_squad_w_distillation.py @@ -25,8 +25,7 @@ import glob import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler import torch.nn.functional as F import torch.nn as nn @@ -38,19 +37,32 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, BertConfig, - BertForQuestionAnswering, BertTokenizer, - XLMConfig, XLMForQuestionAnswering, - XLMTokenizer, XLNetConfig, - XLNetForQuestionAnswering, - XLNetTokenizer, - DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForQuestionAnswering, + BertTokenizer, + XLMConfig, + XLMForQuestionAnswering, + XLMTokenizer, + XLNetConfig, + XLNetForQuestionAnswering, + XLNetTokenizer, + DistilBertConfig, + DistilBertForQuestionAnswering, + DistilBertTokenizer, +) from transformers import AdamW, get_linear_schedule_with_warmup -from ..utils_squad import (read_squad_examples, convert_examples_to_features, - RawResult, write_predictions, - RawResultExtended, write_predictions_extended) +from ..utils_squad import ( + read_squad_examples, + convert_examples_to_features, + RawResult, + write_predictions, + RawResultExtended, + write_predictions_extended, +) # The follwing import is the official SQuAD evaluation script (2.0). 
# You can remove it from the dependencies if you are using this script outside of the library @@ -59,16 +71,18 @@ from ..utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \ - for conf in (BertConfig, XLNetConfig, XLMConfig)), ()) +ALL_MODELS = sum( + (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), () +) MODEL_CLASSES = { - 'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer), - 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), - 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) + "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer), + "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), + "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), + "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), } + def set_seed(args): random.seed(args.seed) np.random.seed(args.seed) @@ -76,9 +90,11 @@ def set_seed(args): if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) + def to_list(tensor): return tensor.detach().cpu().tolist() + def train(args, train_dataset, model, tokenizer, teacher=None): """ Train the model """ if args.local_rank in [-1, 0]: @@ -95,13 +111,18 @@ def train(args, train_dataset, model, tokenizer, teacher=None): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) if args.fp16: try: from apex import amp @@ -115,17 +136,21 @@ def train(args, train_dataset, model, tokenizer, teacher=None): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -141,40 +166,47 @@ def train(args, train_dataset, model, tokenizer, teacher=None): if teacher is not None: teacher.eval() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'start_positions': batch[3], - 'end_positions': batch[4]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] - if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[5], - 'p_mask': batch[6]}) + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "start_positions": batch[3], + "end_positions": batch[4], + } + if args.model_type != "distilbert": + inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2] + if args.model_type in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) outputs = model(**inputs) loss, start_logits_stu, end_logits_stu = outputs # Distillation loss if teacher is not None: - if 'token_type_ids' not in inputs: - inputs['token_type_ids'] = None if args.teacher_type == 'xlm' else batch[2] + if "token_type_ids" not in inputs: + inputs["token_type_ids"] = None if args.teacher_type == "xlm" else batch[2] with torch.no_grad(): - start_logits_tea, end_logits_tea = teacher(input_ids=inputs['input_ids'], - token_type_ids=inputs['token_type_ids'], - attention_mask=inputs['attention_mask']) + start_logits_tea, end_logits_tea = teacher( + input_ids=inputs["input_ids"], + token_type_ids=inputs["token_type_ids"], + attention_mask=inputs["attention_mask"], + ) assert start_logits_tea.size() == start_logits_stu.size() assert end_logits_tea.size() == end_logits_stu.size() - - loss_fct = nn.KLDivLoss(reduction='batchmean') - loss_start = loss_fct(F.log_softmax(start_logits_stu/args.temperature, dim=-1), - F.softmax(start_logits_tea/args.temperature, dim=-1)) * (args.temperature**2) - loss_end = loss_fct(F.log_softmax(end_logits_stu/args.temperature, dim=-1), - F.softmax(end_logits_tea/args.temperature, dim=-1)) * (args.temperature**2) - loss_ce = (loss_start + loss_end)/2. 
- loss = args.alpha_ce*loss_ce + args.alpha_squad*loss + loss_fct = nn.KLDivLoss(reduction="batchmean") + loss_start = loss_fct( + F.log_softmax(start_logits_stu / args.temperature, dim=-1), + F.softmax(start_logits_tea / args.temperature, dim=-1), + ) * (args.temperature ** 2) + loss_end = loss_fct( + F.log_softmax(end_logits_stu / args.temperature, dim=-1), + F.softmax(end_logits_tea / args.temperature, dim=-1), + ) * (args.temperature ** 2) + loss_ce = (loss_start + loss_end) / 2.0 + + loss = args.alpha_ce * loss_ce + args.alpha_squad * loss if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training + loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -195,22 +227,26 @@ def train(args, train_dataset, model, tokenizer, teacher=None): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + tb_writer.add_scalar("eval_{}".format(key), value, global_step) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -246,32 +282,31 @@ def evaluate(args, model, tokenizer, prefix=""): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1] - } - if args.model_type != 'distilbert': - inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids + inputs = {"input_ids": batch[0], "attention_mask": batch[1]} + if args.model_type != "distilbert": + inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2] # XLM don't use segment_ids example_indices = batch[3] - if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[4], - 'p_mask': batch[5]}) + if args.model_type in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) outputs = model(**inputs) for 
i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) - if args.model_type in ['xlnet', 'xlm']: + if args.model_type in ["xlnet", "xlm"]: # XLNet uses a more complex post-processing procedure - result = RawResultExtended(unique_id = unique_id, - start_top_log_probs = to_list(outputs[0][i]), - start_top_index = to_list(outputs[1][i]), - end_top_log_probs = to_list(outputs[2][i]), - end_top_index = to_list(outputs[3][i]), - cls_logits = to_list(outputs[4][i])) + result = RawResultExtended( + unique_id=unique_id, + start_top_log_probs=to_list(outputs[0][i]), + start_top_index=to_list(outputs[1][i]), + end_top_log_probs=to_list(outputs[2][i]), + end_top_index=to_list(outputs[3][i]), + cls_logits=to_list(outputs[4][i]), + ) else: - result = RawResult(unique_id = unique_id, - start_logits = to_list(outputs[0][i]), - end_logits = to_list(outputs[1][i])) + result = RawResult( + unique_id=unique_id, start_logits=to_list(outputs[0][i]), end_logits=to_list(outputs[1][i]) + ) all_results.append(result) # Compute predictions @@ -282,23 +317,44 @@ def evaluate(args, model, tokenizer, prefix=""): else: output_null_log_odds_file = None - if args.model_type in ['xlnet', 'xlm']: + if args.model_type in ["xlnet", "xlm"]: # XLNet uses a more complex post-processing procedure - write_predictions_extended(examples, features, all_results, args.n_best_size, - args.max_answer_length, output_prediction_file, - output_nbest_file, output_null_log_odds_file, args.predict_file, - model.config.start_n_top, model.config.end_n_top, - args.version_2_with_negative, tokenizer, args.verbose_logging) + write_predictions_extended( + examples, + features, + all_results, + args.n_best_size, + args.max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + args.predict_file, + model.config.start_n_top, + model.config.end_n_top, + args.version_2_with_negative, + tokenizer, + args.verbose_logging, + ) else: - write_predictions(examples, features, all_results, args.n_best_size, - args.max_answer_length, args.do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, args.verbose_logging, - args.version_2_with_negative, args.null_score_diff_threshold) + write_predictions( + examples, + features, + all_results, + args.n_best_size, + args.max_answer_length, + args.do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + args.verbose_logging, + args.version_2_with_negative, + args.null_score_diff_threshold, + ) # Evaluate with the official SQuAD script - evaluate_options = EVAL_OPTS(data_file=args.predict_file, - pred_file=output_prediction_file, - na_prob_file=output_null_log_odds_file) + evaluate_options = EVAL_OPTS( + data_file=args.predict_file, pred_file=output_prediction_file, na_prob_file=output_null_log_odds_file + ) results = evaluate_on_squad(evaluate_options) return results @@ -309,24 +365,30 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal # Load data features from cache or dataset file input_file = args.predict_file if evaluate else args.train_file - cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length))) + cached_features_file = os.path.join( + os.path.dirname(input_file), + "cached_{}_{}_{}".format( + "dev" if evaluate else "train", + 
list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + ), + ) if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", input_file) - examples = read_squad_examples(input_file=input_file, - is_training=not evaluate, - version_2_with_negative=args.version_2_with_negative) - features = convert_examples_to_features(examples=examples, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - doc_stride=args.doc_stride, - max_query_length=args.max_query_length, - is_training=not evaluate) + examples = read_squad_examples( + input_file=input_file, is_training=not evaluate, version_2_with_negative=args.version_2_with_negative + ) + features = convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) @@ -342,14 +404,21 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) if evaluate: all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_example_index, all_cls_index, all_p_mask) + dataset = TensorDataset( + all_input_ids, all_input_mask, all_segment_ids, all_example_index, all_cls_index, all_p_mask + ) else: all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_start_positions, all_end_positions, - all_cls_index, all_p_mask) + dataset = TensorDataset( + all_input_ids, + all_input_mask, + all_segment_ids, + all_start_positions, + all_end_positions, + all_cls_index, + all_p_mask, + ) if output_examples: return dataset, examples, features @@ -360,121 +429,213 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--train_file", default=None, type=str, required=True, - help="SQuAD json for training. E.g., train-v1.1.json") - parser.add_argument("--predict_file", default=None, type=str, required=True, - help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model checkpoints and predictions will be written.") + parser.add_argument( + "--train_file", default=None, type=str, required=True, help="SQuAD json for training. E.g., train-v1.1.json" + ) + parser.add_argument( + "--predict_file", + default=None, + type=str, + required=True, + help="SQuAD json for predictions. 
E.g., dev-v1.1.json or test-v1.1.json", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints and predictions will be written.", + ) # Distillation parameters (optional) - parser.add_argument('--teacher_type', default=None, type=str, - help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.") - parser.add_argument('--teacher_name_or_path', default=None, type=str, - help="Path to the already SQuAD fine-tuned teacher model. Only for distillation.") - parser.add_argument('--alpha_ce', default=0.5, type=float, - help="Distillation loss linear weight. Only for distillation.") - parser.add_argument('--alpha_squad', default=0.5, type=float, - help="True SQuAD loss linear weight. Only for distillation.") - parser.add_argument('--temperature', default=2.0, type=float, - help="Distillation temperature. Only for distillation.") + parser.add_argument( + "--teacher_type", + default=None, + type=str, + help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.", + ) + parser.add_argument( + "--teacher_name_or_path", + default=None, + type=str, + help="Path to the already SQuAD fine-tuned teacher model. Only for distillation.", + ) + parser.add_argument( + "--alpha_ce", default=0.5, type=float, help="Distillation loss linear weight. Only for distillation." + ) + parser.add_argument( + "--alpha_squad", default=0.5, type=float, help="True SQuAD loss linear weight. Only for distillation." + ) + parser.add_argument( + "--temperature", default=2.0, type=float, help="Distillation temperature. Only for distillation." + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - - parser.add_argument('--version_2_with_negative', action='store_true', - help='If true, the SQuAD examples contain some that do not have an answer.') - parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, - help="If null_score - best_non_null is greater than the threshold predict null.") - - parser.add_argument("--max_seq_length", default=384, type=int, - help="The maximum total input sequence length after WordPiece tokenization. Sequences " - "longer than this will be truncated, and sequences shorter than this will be padded.") - parser.add_argument("--doc_stride", default=128, type=int, - help="When splitting up a long document into chunks, how much stride to take between chunks.") - parser.add_argument("--max_query_length", default=64, type=int, - help="The maximum number of tokens for the question. 
Questions longer than this will " - "be truncated to this length.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") - - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - parser.add_argument("--n_best_size", default=20, type=int, - help="The total number of n-best predictions to generate in the nbest_predictions.json output file.") - parser.add_argument("--max_answer_length", default=30, type=int, - help="The maximum length of an answer that can be generated. This is needed because the start " - "and end predictions are not conditioned on one another.") - parser.add_argument("--verbose_logging", action='store_true', - help="If true, all of the warnings related to data processing will be printed. 
" - "A number of warnings are expected for a normal SQuAD evaluation.") - - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Whether not to use CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") - - parser.add_argument("--local_rank", type=int, default=-1, - help="local_rank for distributed training on gpus") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + + parser.add_argument( + "--version_2_with_negative", + action="store_true", + help="If true, the SQuAD examples contain some that do not have an answer.", + ) + parser.add_argument( + "--null_score_diff_threshold", + type=float, + default=0.0, + help="If null_score - best_non_null is greater than the threshold predict null.", + ) + + parser.add_argument( + "--max_seq_length", + default=384, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. Sequences " + "longer than this will be truncated, and sequences shorter than this will be padded.", + ) + parser.add_argument( + "--doc_stride", + default=128, + type=int, + help="When splitting up a long document into chunks, how much stride to take between chunks.", + ) + parser.add_argument( + "--max_query_length", + default=64, + type=int, + help="The maximum number of tokens for the question. Questions longer than this will " + "be truncated to this length.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." 
+ ) + + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + parser.add_argument( + "--n_best_size", + default=20, + type=int, + help="The total number of n-best predictions to generate in the nbest_predictions.json output file.", + ) + parser.add_argument( + "--max_answer_length", + default=30, + type=int, + help="The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another.", + ) + parser.add_argument( + "--verbose_logging", + action="store_true", + help="If true, all of the warnings related to data processing will be printed. " + "A number of warnings are expected for a normal SQuAD evaluation.", + ) + + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") + + parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
+ "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -486,16 +647,24 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -506,27 +675,34 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" 
in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.teacher_type is not None: assert args.teacher_name_or_path is not None - assert args.alpha_ce > 0. - assert args.alpha_ce + args.alpha_squad > 0. - assert args.teacher_type != 'distilbert', "We constraint teachers not to be of type DistilBERT." + assert args.alpha_ce > 0.0 + assert args.alpha_ce + args.alpha_squad > 0.0 + assert args.teacher_type != "distilbert", "We constraint teachers not to be of type DistilBERT." teacher_config_class, teacher_model_class, _ = MODEL_CLASSES[args.teacher_type] - teacher_config = teacher_config_class.from_pretrained(args.teacher_name_or_path, - cache_dir=args.cache_dir if args.cache_dir else None) - teacher = teacher_model_class.from_pretrained(args.teacher_name_or_path, - config=teacher_config, - cache_dir=args.cache_dir if args.cache_dir else None) + teacher_config = teacher_config_class.from_pretrained( + args.teacher_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None + ) + teacher = teacher_model_class.from_pretrained( + args.teacher_name_or_path, config=teacher_config, cache_dir=args.cache_dir if args.cache_dir else None + ) teacher.to(args.device) else: teacher = None @@ -544,7 +720,6 @@ def main(): global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher=teacher) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Save the trained model and the tokenizer if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed @@ -554,41 +729,44 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir, cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.output_dir, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) + tokenizer = tokenizer_class.from_pretrained( + args.output_dir, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None + ) model.to(args.device) - # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory results = {} if args.do_eval and args.local_rank in [-1, 0]: checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: # Reload the model - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint, cache_dir=args.cache_dir if args.cache_dir else None) model.to(args.device) # Evaluate result = evaluate(args, model, tokenizer, prefix=global_step) - result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items()) + result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items()) results.update(result) logger.info("Results: {}".format(results)) diff --git a/examples/distillation/scripts/binarized_data.py b/examples/distillation/scripts/binarized_data.py index 681cc2de3..40bde7d15 100644 --- a/examples/distillation/scripts/binarized_data.py +++ b/examples/distillation/scripts/binarized_data.py @@ -23,68 +23,65 @@ import numpy as np from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer import logging -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO +) logger = logging.getLogger(__name__) + def main(): - parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).") - parser.add_argument('--file_path', type=str, default='data/dump.txt', - help='The path to the data.') - parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2']) - parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased', 
- help="The tokenizer to use.") - parser.add_argument('--dump_file', type=str, default='data/dump', - help='The dump file prefix.') + parser = argparse.ArgumentParser( + description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids)." + ) + parser.add_argument("--file_path", type=str, default="data/dump.txt", help="The path to the data.") + parser.add_argument("--tokenizer_type", type=str, default="bert", choices=["bert", "roberta", "gpt2"]) + parser.add_argument("--tokenizer_name", type=str, default="bert-base-uncased", help="The tokenizer to use.") + parser.add_argument("--dump_file", type=str, default="data/dump", help="The dump file prefix.") args = parser.parse_args() - - logger.info(f'Loading Tokenizer ({args.tokenizer_name})') - if args.tokenizer_type == 'bert': + logger.info(f"Loading Tokenizer ({args.tokenizer_name})") + if args.tokenizer_type == "bert": tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name) - bos = tokenizer.special_tokens_map['cls_token'] # `[CLS]` - sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]` - elif args.tokenizer_type == 'roberta': + bos = tokenizer.special_tokens_map["cls_token"] # `[CLS]` + sep = tokenizer.special_tokens_map["sep_token"] # `[SEP]` + elif args.tokenizer_type == "roberta": tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name) - bos = tokenizer.special_tokens_map['cls_token'] # `` - sep = tokenizer.special_tokens_map['sep_token'] # `` - elif args.tokenizer_type == 'gpt2': + bos = tokenizer.special_tokens_map["cls_token"] # `` + sep = tokenizer.special_tokens_map["sep_token"] # `` + elif args.tokenizer_type == "gpt2": tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name) - bos = tokenizer.special_tokens_map['bos_token'] # `<|endoftext|>` - sep = tokenizer.special_tokens_map['eos_token'] # `<|endoftext|>` + bos = tokenizer.special_tokens_map["bos_token"] # `<|endoftext|>` + sep = tokenizer.special_tokens_map["eos_token"] # `<|endoftext|>` - logger.info(f'Loading text from {args.file_path}') - with open(args.file_path, 'r', encoding='utf8') as fp: + logger.info(f"Loading text from {args.file_path}") + with open(args.file_path, "r", encoding="utf8") as fp: data = fp.readlines() - - logger.info(f'Start encoding') - logger.info(f'{len(data)} examples to process.') + logger.info(f"Start encoding") + logger.info(f"{len(data)} examples to process.") rslt = [] iter = 0 interval = 10000 start = time.time() for text in data: - text = f'{bos} {text.strip()} {sep}' + text = f"{bos} {text.strip()} {sep}" token_ids = tokenizer.encode(text, add_special_tokens=False) rslt.append(token_ids) iter += 1 if iter % interval == 0: end = time.time() - logger.info(f'{iter} examples processed. - {(end-start)/interval:.2f}s/expl') + logger.info(f"{iter} examples processed. 
- {(end-start)/interval:.2f}s/expl") start = time.time() - logger.info('Finished binarization') - logger.info(f'{len(data)} examples processed.') - + logger.info("Finished binarization") + logger.info(f"{len(data)} examples processed.") - dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle' + dp_file = f"{args.dump_file}.{args.tokenizer_name}.pickle" rslt_ = [np.uint16(d) for d in rslt] random.shuffle(rslt_) - logger.info(f'Dump to {dp_file}') - with open(dp_file, 'wb') as handle: + logger.info(f"Dump to {dp_file}") + with open(dp_file, "wb") as handle: pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/examples/distillation/scripts/extract.py b/examples/distillation/scripts/extract.py index 5ae1607f3..9610f8f17 100644 --- a/examples/distillation/scripts/extract.py +++ b/examples/distillation/scripts/extract.py @@ -20,70 +20,80 @@ from transformers import BertForMaskedLM, RobertaForMaskedLM, GPT2LMHeadModel import torch import argparse -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation") +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation" + ) parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"]) - parser.add_argument("--model_name", default='roberta-large', type=str) - parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_roberta_048131723.pth', type=str) - parser.add_argument("--vocab_transform", action='store_true') + parser.add_argument("--model_name", default="roberta-large", type=str) + parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_roberta_048131723.pth", type=str) + parser.add_argument("--vocab_transform", action="store_true") args = parser.parse_args() - - if args.model_type == 'roberta': + if args.model_type == "roberta": model = RobertaForMaskedLM.from_pretrained(args.model_name) - prefix = 'roberta' - elif args.model_type == 'gpt2': + prefix = "roberta" + elif args.model_type == "gpt2": model = GPT2LMHeadModel.from_pretrained(args.model_name) - prefix = 'transformer' + prefix = "transformer" state_dict = model.state_dict() compressed_sd = {} ### Embeddings ### - if args.model_type == 'gpt2': - for param_name in ['wte.weight', 'wpe.weight']: - compressed_sd[f'{prefix}.{param_name}'] = state_dict[f'{prefix}.{param_name}'] + if args.model_type == "gpt2": + for param_name in ["wte.weight", "wpe.weight"]: + compressed_sd[f"{prefix}.{param_name}"] = state_dict[f"{prefix}.{param_name}"] else: - for w in ['word_embeddings', 'position_embeddings', 'token_type_embeddings']: - param_name = f'{prefix}.embeddings.{w}.weight' + for w in ["word_embeddings", "position_embeddings", "token_type_embeddings"]: + param_name = f"{prefix}.embeddings.{w}.weight" compressed_sd[param_name] = state_dict[param_name] - for w in ['weight', 'bias']: - param_name = f'{prefix}.embeddings.LayerNorm.{w}' + for w in ["weight", "bias"]: + param_name = f"{prefix}.embeddings.LayerNorm.{w}" compressed_sd[param_name] = state_dict[param_name] ### Transformer Blocks ### std_idx = 0 for teacher_idx in [0, 2, 4, 7, 9, 11]: - if args.model_type == 'gpt2': - for layer in ['ln_1', 'attn.c_attn', 'attn.c_proj', 'ln_2', 'mlp.c_fc', 'mlp.c_proj']: - for w in ['weight', 'bias']: - compressed_sd[f'{prefix}.h.{std_idx}.{layer}.{w}'] = \ - 
state_dict[f'{prefix}.h.{teacher_idx}.{layer}.{w}'] - compressed_sd[f'{prefix}.h.{std_idx}.attn.bias'] = state_dict[f'{prefix}.h.{teacher_idx}.attn.bias'] + if args.model_type == "gpt2": + for layer in ["ln_1", "attn.c_attn", "attn.c_proj", "ln_2", "mlp.c_fc", "mlp.c_proj"]: + for w in ["weight", "bias"]: + compressed_sd[f"{prefix}.h.{std_idx}.{layer}.{w}"] = state_dict[ + f"{prefix}.h.{teacher_idx}.{layer}.{w}" + ] + compressed_sd[f"{prefix}.h.{std_idx}.attn.bias"] = state_dict[f"{prefix}.h.{teacher_idx}.attn.bias"] else: - for layer in ['attention.self.query', 'attention.self.key', 'attention.self.value', - 'attention.output.dense', 'attention.output.LayerNorm', - 'intermediate.dense', 'output.dense', 'output.LayerNorm']: - for w in ['weight', 'bias']: - compressed_sd[f'{prefix}.encoder.layer.{std_idx}.{layer}.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}'] + for layer in [ + "attention.self.query", + "attention.self.key", + "attention.self.value", + "attention.output.dense", + "attention.output.LayerNorm", + "intermediate.dense", + "output.dense", + "output.LayerNorm", + ]: + for w in ["weight", "bias"]: + compressed_sd[f"{prefix}.encoder.layer.{std_idx}.{layer}.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}" + ] std_idx += 1 ### Language Modeling Head ###s - if args.model_type == 'roberta': - for layer in ['lm_head.decoder.weight', 'lm_head.bias']: - compressed_sd[f'{layer}'] = state_dict[f'{layer}'] + if args.model_type == "roberta": + for layer in ["lm_head.decoder.weight", "lm_head.bias"]: + compressed_sd[f"{layer}"] = state_dict[f"{layer}"] if args.vocab_transform: - for w in ['weight', 'bias']: - compressed_sd[f'lm_head.dense.{w}'] = state_dict[f'lm_head.dense.{w}'] - compressed_sd[f'lm_head.layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}'] - elif args.model_type == 'gpt2': - for w in ['weight', 'bias']: - compressed_sd[f'{prefix}.ln_f.{w}'] = state_dict[f'{prefix}.ln_f.{w}'] - compressed_sd[f'lm_head.weight'] = state_dict[f'lm_head.weight'] + for w in ["weight", "bias"]: + compressed_sd[f"lm_head.dense.{w}"] = state_dict[f"lm_head.dense.{w}"] + compressed_sd[f"lm_head.layer_norm.{w}"] = state_dict[f"lm_head.layer_norm.{w}"] + elif args.model_type == "gpt2": + for w in ["weight", "bias"]: + compressed_sd[f"{prefix}.ln_f.{w}"] = state_dict[f"{prefix}.ln_f.{w}"] + compressed_sd[f"lm_head.weight"] = state_dict[f"lm_head.weight"] - print(f'N layers selected for distillation: {std_idx}') - print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}') + print(f"N layers selected for distillation: {std_idx}") + print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}") - print(f'Save transfered checkpoint to {args.dump_checkpoint}.') + print(f"Save transfered checkpoint to {args.dump_checkpoint}.") torch.save(compressed_sd, args.dump_checkpoint) diff --git a/examples/distillation/scripts/extract_distilbert.py b/examples/distillation/scripts/extract_distilbert.py index fdb0662ca..8e58db555 100644 --- a/examples/distillation/scripts/extract_distilbert.py +++ b/examples/distillation/scripts/extract_distilbert.py @@ -20,63 +20,70 @@ from transformers import BertForMaskedLM, RobertaForMaskedLM import torch import argparse -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation") +if __name__ == "__main__": + parser = argparse.ArgumentParser( + 
description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation" + ) parser.add_argument("--model_type", default="bert", choices=["bert"]) - parser.add_argument("--model_name", default='bert-base-uncased', type=str) - parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str) - parser.add_argument("--vocab_transform", action='store_true') + parser.add_argument("--model_name", default="bert-base-uncased", type=str) + parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_bert-base-uncased_0247911.pth", type=str) + parser.add_argument("--vocab_transform", action="store_true") args = parser.parse_args() - - if args.model_type == 'bert': + if args.model_type == "bert": model = BertForMaskedLM.from_pretrained(args.model_name) - prefix = 'bert' + prefix = "bert" else: raise ValueError(f'args.model_type should be "bert".') state_dict = model.state_dict() compressed_sd = {} - for w in ['word_embeddings', 'position_embeddings']: - compressed_sd[f'distilbert.embeddings.{w}.weight'] = \ - state_dict[f'{prefix}.embeddings.{w}.weight'] - for w in ['weight', 'bias']: - compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \ - state_dict[f'{prefix}.embeddings.LayerNorm.{w}'] + for w in ["word_embeddings", "position_embeddings"]: + compressed_sd[f"distilbert.embeddings.{w}.weight"] = state_dict[f"{prefix}.embeddings.{w}.weight"] + for w in ["weight", "bias"]: + compressed_sd[f"distilbert.embeddings.LayerNorm.{w}"] = state_dict[f"{prefix}.embeddings.LayerNorm.{w}"] std_idx = 0 for teacher_idx in [0, 2, 4, 7, 9, 11]: - for w in ['weight', 'bias']: - compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}'] - compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}'] - compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}'] + for w in ["weight", "bias"]: + compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}" + ] + compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}" + ] + compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}" + ] - compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}'] - compressed_sd[f'distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}'] + compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}" + ] + compressed_sd[f"distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}" + ] - compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}'] - 
compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}'] - compressed_sd[f'distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}'] + compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}" + ] + compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}" + ] + compressed_sd[f"distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}" + ] std_idx += 1 - compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight'] - compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias'] + compressed_sd[f"vocab_projector.weight"] = state_dict[f"cls.predictions.decoder.weight"] + compressed_sd[f"vocab_projector.bias"] = state_dict[f"cls.predictions.bias"] if args.vocab_transform: - for w in ['weight', 'bias']: - compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}'] - compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}'] + for w in ["weight", "bias"]: + compressed_sd[f"vocab_transform.{w}"] = state_dict[f"cls.predictions.transform.dense.{w}"] + compressed_sd[f"vocab_layer_norm.{w}"] = state_dict[f"cls.predictions.transform.LayerNorm.{w}"] - print(f'N layers selected for distillation: {std_idx}') - print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}') + print(f"N layers selected for distillation: {std_idx}") + print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}") - print(f'Save transfered checkpoint to {args.dump_checkpoint}.') + print(f"Save transfered checkpoint to {args.dump_checkpoint}.") torch.save(compressed_sd, args.dump_checkpoint) diff --git a/examples/distillation/scripts/token_counts.py b/examples/distillation/scripts/token_counts.py index d9de17da4..623caad4b 100644 --- a/examples/distillation/scripts/token_counts.py +++ b/examples/distillation/scripts/token_counts.py @@ -20,32 +20,36 @@ import argparse import pickle import logging -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO +) logger = logging.getLogger(__name__) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)") - parser.add_argument("--data_file", type=str, default="data/dump.bert-base-uncased.pickle", - help="The binarized dataset.") - parser.add_argument("--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", - help="The dump file.") +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)" + ) + parser.add_argument( + "--data_file", type=str, default="data/dump.bert-base-uncased.pickle", help="The binarized dataset." + ) + parser.add_argument( + "--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", help="The dump file." 
+ ) parser.add_argument("--vocab_size", default=30522, type=int) args = parser.parse_args() - logger.info(f'Loading data from {args.data_file}') - with open(args.data_file, 'rb') as fp: + logger.info(f"Loading data from {args.data_file}") + with open(args.data_file, "rb") as fp: data = pickle.load(fp) - logger.info('Counting occurences for MLM.') + logger.info("Counting occurences for MLM.") counter = Counter() for tk_ids in data: counter.update(tk_ids) - counts = [0]*args.vocab_size + counts = [0] * args.vocab_size for k, v in counter.items(): counts[k] = v - logger.info(f'Dump to {args.token_counts_dump}') - with open(args.token_counts_dump, 'wb') as handle: + logger.info(f"Dump to {args.token_counts_dump}") + with open(args.token_counts_dump, "wb") as handle: pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/examples/distillation/train.py b/examples/distillation/train.py index 311f0580f..37c49ae7b 100644 --- a/examples/distillation/train.py +++ b/examples/distillation/train.py @@ -35,166 +35,200 @@ from lm_seqs_dataset import LmSeqsDataset MODEL_CLASSES = { - 'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer), - 'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer), - 'bert': (BertConfig, BertForMaskedLM, BertTokenizer), - 'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer) + "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer), + "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer), + "bert": (BertConfig, BertForMaskedLM, BertTokenizer), + "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer), } + def sanity_checks(args): """ A bunch of args sanity checks to perform even starting... """ - assert (args.mlm and args.alpha_mlm > 0.) or (not args.mlm and args.alpha_mlm == 0.) - assert (args.alpha_mlm > 0. and args.alpha_clm == 0.) or (args.alpha_mlm == 0. and args.alpha_clm > 0.) + assert (args.mlm and args.alpha_mlm > 0.0) or (not args.mlm and args.alpha_mlm == 0.0) + assert (args.alpha_mlm > 0.0 and args.alpha_clm == 0.0) or (args.alpha_mlm == 0.0 and args.alpha_clm > 0.0) if args.mlm: assert os.path.isfile(args.token_counts) - assert (args.student_type in ['roberta', 'distilbert']) and (args.teacher_type in ['roberta', 'bert']) + assert (args.student_type in ["roberta", "distilbert"]) and (args.teacher_type in ["roberta", "bert"]) else: - assert (args.student_type in ['gpt2']) and (args.teacher_type in ['gpt2']) + assert (args.student_type in ["gpt2"]) and (args.teacher_type in ["gpt2"]) - assert args.teacher_type == args.student_type or (args.student_type=='distilbert' and args.teacher_type=='bert') + assert args.teacher_type == args.student_type or ( + args.student_type == "distilbert" and args.teacher_type == "bert" + ) assert os.path.isfile(args.student_config) if args.student_pretrained_weights is not None: assert os.path.isfile(args.student_pretrained_weights) - if args.freeze_token_type_embds: assert args.student_type in ['roberta'] + if args.freeze_token_type_embds: + assert args.student_type in ["roberta"] + + assert args.alpha_ce >= 0.0 + assert args.alpha_mlm >= 0.0 + assert args.alpha_clm >= 0.0 + assert args.alpha_mse >= 0.0 + assert args.alpha_cos >= 0.0 + assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.0 - assert args.alpha_ce >= 0. - assert args.alpha_mlm >= 0. - assert args.alpha_clm >= 0. - assert args.alpha_mse >= 0. - assert args.alpha_cos >= 0. 
- assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0. def freeze_pos_embeddings(student, args): - if args.student_type == 'roberta': + if args.student_type == "roberta": student.roberta.embeddings.position_embeddings.weight.requires_grad = False - elif args.student_type == 'gpt2': + elif args.student_type == "gpt2": student.transformer.wpe.weight.requires_grad = False + def freeze_token_type_embeddings(student, args): - if args.student_type == 'roberta': + if args.student_type == "roberta": student.roberta.embeddings.token_type_embeddings.weight.requires_grad = False + def main(): parser = argparse.ArgumentParser(description="Training") - parser.add_argument("--force", action='store_true', - help="Overwrite dump_path if it already exists.") - - parser.add_argument("--dump_path", type=str, required=True, - help="The output directory (log, checkpoints, parameters, etc.)") - parser.add_argument("--data_file", type=str, required=True, - help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.") - - parser.add_argument("--student_type", type=str, choices=["distilbert", "roberta", "gpt2"], required=True, - help="The student type (DistilBERT, RoBERTa).") - parser.add_argument("--student_config", type=str, required=True, - help="Path to the student configuration.") - parser.add_argument("--student_pretrained_weights", default=None, type=str, - help="Load student initialization checkpoint.") - - parser.add_argument("--teacher_type", choices=["bert", "roberta", "gpt2"], required=True, - help="Teacher type (BERT, RoBERTa).") - parser.add_argument("--teacher_name", type=str, required=True, - help="The teacher model.") - - parser.add_argument("--temperature", default=2., type=float, - help="Temperature for the softmax temperature.") - parser.add_argument("--alpha_ce", default=0.5, type=float, - help="Linear weight for the distillation loss. Must be >=0.") - parser.add_argument("--alpha_mlm", default=0.0, type=float, - help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.") - parser.add_argument("--alpha_clm", default=0.5, type=float, - help="Linear weight for the CLM loss. Must be >=0.") - parser.add_argument("--alpha_mse", default=0.0, type=float, - help="Linear weight of the MSE loss. Must be >=0.") - parser.add_argument("--alpha_cos", default=0.0, type=float, - help="Linear weight of the cosine embedding loss. Must be >=0.") - - parser.add_argument("--mlm", action="store_true", - help="The LM step: MLM or CLM. 
If `mlm` is True, the MLM is used over CLM.") - parser.add_argument("--mlm_mask_prop", default=0.15, type=float, - help="Proportion of tokens for which we need to make a prediction.") - parser.add_argument("--word_mask", default=0.8, type=float, - help="Proportion of tokens to mask out.") - parser.add_argument("--word_keep", default=0.1, type=float, - help="Proportion of tokens to keep.") - parser.add_argument("--word_rand", default=0.1, type=float, - help="Proportion of tokens to randomly replace.") - parser.add_argument("--mlm_smoothing", default=0.7, type=float, - help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).") - parser.add_argument("--token_counts", type=str, - help="The token counts in the data_file for MLM.") - - parser.add_argument("--restrict_ce_to_mask", action='store_true', - help="If true, compute the distilation loss only the [MLM] prediction distribution.") - parser.add_argument("--freeze_pos_embs", action="store_true", - help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.") - parser.add_argument("--freeze_token_type_embds", action="store_true", - help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.") - - parser.add_argument("--n_epoch", type=int, default=3, - help="Number of pass on the whole dataset.") - parser.add_argument("--batch_size", type=int, default=5, - help="Batch size (for each process).") - parser.add_argument("--group_by_size", action='store_false', - help="If true, group sequences that have similar length into the same batch. Default is true.") - - parser.add_argument("--gradient_accumulation_steps", type=int, default=50, - help="Gradient accumulation for larger training batches.") - parser.add_argument("--warmup_prop", default=0.05, type=float, - help="Linear warmup proportion.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--learning_rate", default=5e-4, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--adam_epsilon", default=1e-6, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=5.0, type=float, - help="Max gradient norm.") - parser.add_argument("--initializer_range", default=0.02, type=float, - help="Random initialization range.") - - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
- "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--n_gpu", type=int, default=1, - help="Number of GPUs in the node.") - parser.add_argument("--local_rank", type=int, default=-1, - help="Distributed training - Local rank") - parser.add_argument("--seed", type=int, default=56, - help="Random seed") - - parser.add_argument("--log_interval", type=int, default=500, - help="Tensorboard logging interval.") - parser.add_argument("--checkpoint_interval", type=int, default=4000, - help="Checkpoint interval.") + parser.add_argument("--force", action="store_true", help="Overwrite dump_path if it already exists.") + + parser.add_argument( + "--dump_path", type=str, required=True, help="The output directory (log, checkpoints, parameters, etc.)" + ) + parser.add_argument( + "--data_file", + type=str, + required=True, + help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.", + ) + + parser.add_argument( + "--student_type", + type=str, + choices=["distilbert", "roberta", "gpt2"], + required=True, + help="The student type (DistilBERT, RoBERTa).", + ) + parser.add_argument("--student_config", type=str, required=True, help="Path to the student configuration.") + parser.add_argument( + "--student_pretrained_weights", default=None, type=str, help="Load student initialization checkpoint." + ) + + parser.add_argument( + "--teacher_type", choices=["bert", "roberta", "gpt2"], required=True, help="Teacher type (BERT, RoBERTa)." + ) + parser.add_argument("--teacher_name", type=str, required=True, help="The teacher model.") + + parser.add_argument("--temperature", default=2.0, type=float, help="Temperature for the softmax temperature.") + parser.add_argument( + "--alpha_ce", default=0.5, type=float, help="Linear weight for the distillation loss. Must be >=0." + ) + parser.add_argument( + "--alpha_mlm", + default=0.0, + type=float, + help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.", + ) + parser.add_argument("--alpha_clm", default=0.5, type=float, help="Linear weight for the CLM loss. Must be >=0.") + parser.add_argument("--alpha_mse", default=0.0, type=float, help="Linear weight of the MSE loss. Must be >=0.") + parser.add_argument( + "--alpha_cos", default=0.0, type=float, help="Linear weight of the cosine embedding loss. Must be >=0." + ) + + parser.add_argument( + "--mlm", action="store_true", help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM." + ) + parser.add_argument( + "--mlm_mask_prop", + default=0.15, + type=float, + help="Proportion of tokens for which we need to make a prediction.", + ) + parser.add_argument("--word_mask", default=0.8, type=float, help="Proportion of tokens to mask out.") + parser.add_argument("--word_keep", default=0.1, type=float, help="Proportion of tokens to keep.") + parser.add_argument("--word_rand", default=0.1, type=float, help="Proportion of tokens to randomly replace.") + parser.add_argument( + "--mlm_smoothing", + default=0.7, + type=float, + help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).", + ) + parser.add_argument("--token_counts", type=str, help="The token counts in the data_file for MLM.") + + parser.add_argument( + "--restrict_ce_to_mask", + action="store_true", + help="If true, compute the distilation loss only the [MLM] prediction distribution.", + ) + parser.add_argument( + "--freeze_pos_embs", + action="store_true", + help="Freeze positional embeddings during distillation. 
For student_type in ['roberta', 'gpt2'] only.", + ) + parser.add_argument( + "--freeze_token_type_embds", + action="store_true", + help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.", + ) + + parser.add_argument("--n_epoch", type=int, default=3, help="Number of pass on the whole dataset.") + parser.add_argument("--batch_size", type=int, default=5, help="Batch size (for each process).") + parser.add_argument( + "--group_by_size", + action="store_false", + help="If true, group sequences that have similar length into the same batch. Default is true.", + ) + + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=50, + help="Gradient accumulation for larger training batches.", + ) + parser.add_argument("--warmup_prop", default=0.05, type=float, help="Linear warmup proportion.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--learning_rate", default=5e-4, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=5.0, type=float, help="Max gradient norm.") + parser.add_argument("--initializer_range", default=0.02, type=float, help="Random initialization range.") + + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs in the node.") + parser.add_argument("--local_rank", type=int, default=-1, help="Distributed training - Local rank") + parser.add_argument("--seed", type=int, default=56, help="Random seed") + + parser.add_argument("--log_interval", type=int, default=500, help="Tensorboard logging interval.") + parser.add_argument("--checkpoint_interval", type=int, default=4000, help="Checkpoint interval.") args = parser.parse_args() sanity_checks(args) - ## ARGS ## init_gpu_params(args) set_seed(args) if args.is_master: if os.path.exists(args.dump_path): if not args.force: - raise ValueError(f'Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it' - 'Use `--force` if you want to overwrite it') + raise ValueError( + f"Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it" + "Use `--force` if you want to overwrite it" + ) else: shutil.rmtree(args.dump_path) if not os.path.exists(args.dump_path): os.makedirs(args.dump_path) - logger.info(f'Experiment will be dumped and logged in {args.dump_path}') - + logger.info(f"Experiment will be dumped and logged in {args.dump_path}") ### SAVE PARAMS ### - logger.info(f'Param: {args}') - with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f: + logger.info(f"Param: {args}") + with open(os.path.join(args.dump_path, "parameters.json"), "w") as f: json.dump(vars(args), f, indent=4) git_log(args.dump_path) @@ -207,58 +241,50 @@ def main(): for tok_name, tok_symbol in tokenizer.special_tokens_map.items(): idx = tokenizer.all_special_tokens.index(tok_symbol) special_tok_ids[tok_name] = tokenizer.all_special_ids[idx] - logger.info(f'Special tokens {special_tok_ids}') + 
logger.info(f"Special tokens {special_tok_ids}") args.special_tok_ids = special_tok_ids args.max_model_input_size = tokenizer.max_model_input_sizes[args.teacher_name] - ## DATA LOADER ## - logger.info(f'Loading data from {args.data_file}') - with open(args.data_file, 'rb') as fp: + logger.info(f"Loading data from {args.data_file}") + with open(args.data_file, "rb") as fp: data = pickle.load(fp) - if args.mlm: - logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)') - with open(args.token_counts, 'rb') as fp: + logger.info(f"Loading token counts from {args.token_counts} (already pre-computed)") + with open(args.token_counts, "rb") as fp: counts = pickle.load(fp) - + token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing for idx in special_tok_ids.values(): - token_probs[idx] = 0. # do not predict special tokens + token_probs[idx] = 0.0 # do not predict special tokens token_probs = torch.from_numpy(token_probs) else: token_probs = None - train_lm_seq_dataset = LmSeqsDataset(params=args, data=data) - logger.info(f'Data loader created.') - + logger.info(f"Data loader created.") ## STUDENT ## - logger.info(f'Loading student config from {args.student_config}') + logger.info(f"Loading student config from {args.student_config}") stu_architecture_config = student_config_class.from_pretrained(args.student_config) stu_architecture_config.output_hidden_states = True if args.student_pretrained_weights is not None: - logger.info(f'Loading pretrained weights from {args.student_pretrained_weights}') - student = student_model_class.from_pretrained(args.student_pretrained_weights, - config=stu_architecture_config) + logger.info(f"Loading pretrained weights from {args.student_pretrained_weights}") + student = student_model_class.from_pretrained(args.student_pretrained_weights, config=stu_architecture_config) else: student = student_model_class(stu_architecture_config) - if args.n_gpu > 0: - student.to(f'cuda:{args.local_rank}') - logger.info(f'Student loaded.') - + student.to(f"cuda:{args.local_rank}") + logger.info(f"Student loaded.") ## TEACHER ## teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True) if args.n_gpu > 0: - teacher.to(f'cuda:{args.local_rank}') - logger.info(f'Teacher loaded from {args.teacher_name}.') - + teacher.to(f"cuda:{args.local_rank}") + logger.info(f"Teacher loaded from {args.teacher_name}.") ## FREEZING ## if args.freeze_pos_embs: @@ -266,7 +292,6 @@ def main(): if args.freeze_token_type_embds: freeze_token_type_embeddings(student, args) - ## SANITY CHECKS ## assert student.config.vocab_size == teacher.config.vocab_size assert student.config.hidden_size == teacher.config.hidden_size @@ -274,14 +299,11 @@ def main(): if args.mlm: assert token_probs.size(0) == stu_architecture_config.vocab_size - ## DISTILLER ## torch.cuda.empty_cache() - distiller = Distiller(params=args, - dataset=train_lm_seq_dataset, - token_probs=token_probs, - student=student, - teacher=teacher) + distiller = Distiller( + params=args, dataset=train_lm_seq_dataset, token_probs=token_probs, student=student, teacher=teacher + ) distiller.train() logger.info("Let's go get some drinks.") diff --git a/examples/distillation/utils.py b/examples/distillation/utils.py index 3d6250471..f9d7412cb 100644 --- a/examples/distillation/utils.py +++ b/examples/distillation/utils.py @@ -23,9 +23,12 @@ import torch import numpy as np import logging -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s', - datefmt 
= '%m/%d/%Y %H:%M:%S', - level = logging.INFO) + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) logger = logging.getLogger(__name__) @@ -35,12 +38,12 @@ def git_log(folder_path: str): """ repo = git.Repo(search_parent_directories=True) repo_infos = { - 'repo_id': str(repo), - 'repo_sha': str(repo.head.object.hexsha), - 'repo_branch': str(repo.active_branch) + "repo_id": str(repo), + "repo_sha": str(repo.head.object.hexsha), + "repo_branch": str(repo.active_branch), } - with open(os.path.join(folder_path, 'git_log.json'), 'w') as f: + with open(os.path.join(folder_path, "git_log.json"), "w") as f: json.dump(repo_infos, f, indent=4) @@ -57,21 +60,21 @@ def init_gpu_params(params): assert torch.cuda.is_available() - logger.info('Initializing GPUs') + logger.info("Initializing GPUs") if params.n_gpu > 1: assert params.local_rank != -1 - params.world_size = int(os.environ['WORLD_SIZE']) - params.n_gpu_per_node = int(os.environ['N_GPU_NODE']) - params.global_rank = int(os.environ['RANK']) + params.world_size = int(os.environ["WORLD_SIZE"]) + params.n_gpu_per_node = int(os.environ["N_GPU_NODE"]) + params.global_rank = int(os.environ["RANK"]) # number of nodes / node ID params.n_nodes = params.world_size // params.n_gpu_per_node params.node_id = params.global_rank // params.n_gpu_per_node params.multi_gpu = True - assert params.n_nodes == int(os.environ['N_NODES']) - assert params.node_id == int(os.environ['NODE_RANK']) + assert params.n_nodes == int(os.environ["N_NODES"]) + assert params.node_id == int(os.environ["NODE_RANK"]) # local job (single GPU) else: @@ -114,8 +117,7 @@ def init_gpu_params(params): if params.multi_gpu: logger.info("Initializing PyTorch distributed") torch.distributed.init_process_group( - init_method='env://', - backend='nccl', + init_method="env://", backend="nccl", ) diff --git a/examples/mm-imdb/run_mmimdb.py b/examples/mm-imdb/run_mmimdb.py index f4a44bf62..c92dbd3d3 100644 --- a/examples/mm-imdb/run_mmimdb.py +++ b/examples/mm-imdb/run_mmimdb.py @@ -40,29 +40,49 @@ from tqdm import tqdm, trange from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_mmimdb_labels, get_image_transforms -from transformers import (WEIGHTS_NAME, - BertConfig, BertModel, BertTokenizer, - RobertaConfig, RobertaModel, RobertaTokenizer, - XLMConfig, XLMModel, XLMTokenizer, - XLNetConfig, XLNetModel, XLNetTokenizer, - DistilBertConfig, DistilBertModel, DistilBertTokenizer, - AlbertConfig, AlbertModel, AlbertTokenizer, - MMBTForClassification, MMBTConfig) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertModel, + BertTokenizer, + RobertaConfig, + RobertaModel, + RobertaTokenizer, + XLMConfig, + XLMModel, + XLMTokenizer, + XLNetConfig, + XLNetModel, + XLNetTokenizer, + DistilBertConfig, + DistilBertModel, + DistilBertTokenizer, + AlbertConfig, + AlbertModel, + AlbertTokenizer, + MMBTForClassification, + MMBTConfig, +) from transformers import AdamW, get_linear_schedule_with_warmup logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, - RobertaConfig, DistilBertConfig)), ()) +ALL_MODELS = sum( + ( + tuple(conf.pretrained_config_archive_map.keys()) + for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig) + ), + (), +) MODEL_CLASSES = { - 'bert': (BertConfig, BertModel, BertTokenizer), - 'xlnet': (XLNetConfig, XLNetModel, 
XLNetTokenizer), - 'xlm': (XLMConfig, XLMModel, XLMTokenizer), - 'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer), - 'distilbert': (DistilBertConfig, DistilBertModel, DistilBertTokenizer), - 'albert': (AlbertConfig, AlbertModel, AlbertTokenizer) + "bert": (BertConfig, BertModel, BertTokenizer), + "xlnet": (XLNetConfig, XLNetModel, XLNetTokenizer), + "xlm": (XLMConfig, XLMModel, XLMTokenizer), + "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer), + "distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer), + "albert": (AlbertConfig, AlbertModel, AlbertTokenizer), } @@ -81,10 +101,13 @@ def train(args, train_dataset, model, tokenizer, criterion): args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) - train_dataloader = DataLoader(train_dataset, sampler=train_sampler, - batch_size=args.train_batch_size, - collate_fn=collate_fn, - num_workers=args.num_workers) + train_dataloader = DataLoader( + train_dataset, + sampler=train_sampler, + batch_size=args.train_batch_size, + collate_fn=collate_fn, + num_workers=args.num_workers, + ) if args.max_steps > 0: t_total = args.max_steps @@ -93,14 +116,19 @@ def train(args, train_dataset, model, tokenizer, criterion): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) if args.fp16: try: from apex import amp @@ -114,17 +142,21 @@ def train(args, train_dataset, model, tokenizer, criterion): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. 
parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -140,17 +172,19 @@ def train(args, train_dataset, model, tokenizer, criterion): model.train() batch = tuple(t.to(args.device) for t in batch) labels = batch[5] - inputs = {'input_ids': batch[0], - 'input_modal': batch[2], - 'attention_mask': batch[1], - 'modal_start_tokens': batch[3], - 'modal_end_tokens': batch[4]} + inputs = { + "input_ids": batch[0], + "input_modal": batch[2], + "attention_mask": batch[1], + "modal_start_tokens": batch[3], + "modal_end_tokens": batch[4], + } outputs = model(**inputs) logits = outputs[0] # model outputs are always tuple in transformers (see doc) loss = criterion(logits, labels) if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training + loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -174,30 +208,34 @@ def train(args, train_dataset, model, tokenizer, criterion): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer, criterion) for key, value in results.items(): - eval_key = 'eval_{}'.format(key) + eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] - logs['learning_rate'] = learning_rate_scalar - logs['loss'] = loss_scalar + logs["learning_rate"] = learning_rate_scalar + logs["loss"] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) - print(json.dumps({**logs, **{'step': global_step}})) + print(json.dumps({**logs, **{"step": global_step}})) if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training torch.save(model_to_save.state_dict(), os.path.join(output_dir, WEIGHTS_NAME)) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -209,13 +247,13 @@ def train(args, train_dataset, model, tokenizer, criterion): if args.local_rank == -1: results = evaluate(args, model, tokenizer, criterion) - if results['micro_f1'] > best_f1: - best_f1 = results['micro_f1'] + if results["micro_f1"] > best_f1: + best_f1 = results["micro_f1"] n_no_improve = 0 else: n_no_improve += 
1 - if n_no_improve > args.patience: + if n_no_improve > args.patience: train_iterator.close() break @@ -236,7 +274,9 @@ def evaluate(args, model, tokenizer, criterion, prefix=""): args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn) + eval_dataloader = DataLoader( + eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn + ) # multi-gpu eval if args.n_gpu > 1: @@ -257,11 +297,13 @@ def evaluate(args, model, tokenizer, criterion, prefix=""): with torch.no_grad(): batch = tuple(t.to(args.device) for t in batch) labels = batch[5] - inputs = {'input_ids': batch[0], - 'input_modal': batch[2], - 'attention_mask': batch[1], - 'modal_start_tokens': batch[3], - 'modal_end_tokens': batch[4]} + inputs = { + "input_ids": batch[0], + "input_modal": batch[2], + "attention_mask": batch[1], + "modal_start_tokens": batch[3], + "modal_end_tokens": batch[4], + } outputs = model(**inputs) logits = outputs[0] # model outputs are always tuple in transformers (see doc) tmp_eval_loss = criterion(logits, labels) @@ -278,7 +320,7 @@ def evaluate(args, model, tokenizer, criterion, prefix=""): result = { "loss": eval_loss, "macro_f1": f1_score(out_label_ids, preds, average="macro"), - "micro_f1": f1_score(out_label_ids, preds, average="micro") + "micro_f1": f1_score(out_label_ids, preds, average="micro"), } output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") @@ -303,94 +345,147 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. Should contain the .jsonl files for MMIMDB.") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. 
Should contain the .jsonl files for MMIMDB.", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--num_image_embeds", default=1, type=int, - help="Number of Image Embeddings from the Image Encoder") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") - - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--patience", default=5, type=int, - help="Patience for Early Stopping.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Avoid using CUDA when available") - parser.add_argument('--num_workers', type=int, default=8, - help="number of worker threads for dataloading") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") - - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--local_rank", type=int, default=-1, - help="For distributed training: local_rank") - parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", + ) + parser.add_argument( + "--num_image_embeds", default=1, type=int, help="Number of Image Embeddings from the Image Encoder" + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) + + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." 
+ ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument("--patience", default=5, type=int, help="Patience for Early Stopping.") + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument("--num_workers", type=int, default=8, help="number of worker threads for dataloading") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") + + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -402,17 +497,25 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -426,13 +529,17 @@ def main(): num_labels = len(labels) args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - transformer_config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - transformer = model_class.from_pretrained(args.model_name_or_path, - config=transformer_config, - cache_dir=args.cache_dir if args.cache_dir else None) + transformer_config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + transformer = model_class.from_pretrained( + args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir if args.cache_dir else None + ) img_encoder = ImageEncoder(args) config = MMBTConfig(transformer_config, num_labels=num_labels) model = MMBTForClassification(config, transformer, img_encoder) @@ -449,12 +556,13 @@ def main(): train_dataset = load_examples(args, tokenizer, evaluate=False) label_frequences = train_dataset.get_label_frequencies() label_frequences = [label_frequences[l] for l in labels] - label_weights = (torch.tensor(label_frequences, device=args.device, dtype=torch.float) / len(train_dataset)) ** -1 + label_weights = ( + torch.tensor(label_frequences, device=args.device, dtype=torch.float) / len(train_dataset) + ) ** -1 criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights) global_step, tr_loss = train(args, train_dataset, model, tokenizer, criterion) logger.info(" global_step = %s, average loss = %s", 
global_step, tr_loss) - # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed @@ -464,12 +572,14 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training torch.save(model_to_save.state_dict(), os.path.join(args.output_dir, WEIGHTS_NAME)) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = MMBTForClassification(config, transformer, img_encoder) @@ -477,24 +587,25 @@ def main(): tokenizer = tokenizer_class.from_pretrained(args.output_dir) model.to(args.device) - # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" model = MMBTForClassification(config, transformer, img_encoder) model.load_state_dict(torch.load(checkpoint)) model.to(args.device) result = evaluate(args, model, tokenizer, criterion, prefix=prefix) - result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) + result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) return results diff --git a/examples/mm-imdb/utils_mmimdb.py b/examples/mm-imdb/utils_mmimdb.py index c59da0264..57cee25f9 100644 --- a/examples/mm-imdb/utils_mmimdb.py +++ b/examples/mm-imdb/utils_mmimdb.py @@ -25,17 +25,7 @@ import torchvision import torchvision.transforms as transforms from torch.utils.data import Dataset -POOLING_BREAKDOWN = { - 1: (1, 1), - 2: (2, 1), - 3: (3, 1), - 4: (2, 2), - 5: (5, 1), - 6: (3, 2), - 7: (7, 1), - 8: (4, 2), - 9: (3, 3) -} +POOLING_BREAKDOWN = {1: (1, 1), 2: (2, 1), 3: (3, 1), 4: (2, 2), 5: (5, 1), 6: (3, 2), 7: (7, 1), 8: (4, 2), 9: (3, 3)} class ImageEncoder(nn.Module): @@ -54,7 +44,6 @@ class ImageEncoder(nn.Module): return out # BxNx2048 - class JsonlDataset(Dataset): def __init__(self, data_path, tokenizer, transforms, labels, max_seq_length): self.data = [json.loads(l) for l in open(data_path)] @@ -72,7 +61,7 @@ 
class JsonlDataset(Dataset): def __getitem__(self, index): sentence = torch.LongTensor(self.tokenizer.encode(self.data[index]["text"], add_special_tokens=True)) start_token, sentence, end_token = sentence[0], sentence[1:-1], sentence[-1] - sentence = sentence[:self.max_seq_length] + sentence = sentence[: self.max_seq_length] label = torch.zeros(self.n_classes) label[[self.labels.index(tgt) for tgt in self.data[index]["label"]]] = 1 @@ -80,8 +69,13 @@ class JsonlDataset(Dataset): image = Image.open(os.path.join(self.data_dir, self.data[index]["img"])).convert("RGB") image = self.transforms(image) - return {"image_start_token": start_token, "image_end_token": end_token, - "sentence": sentence, "image": image, "label": label} + return { + "image_start_token": start_token, + "image_end_token": end_token, + "sentence": sentence, + "image": image, + "label": label, + } def get_label_frequencies(self): label_freqs = Counter() @@ -110,10 +104,31 @@ def collate_fn(batch): def get_mmimdb_labels(): - return ['Crime', 'Drama', 'Thriller', 'Action', 'Comedy', 'Romance', - 'Documentary', 'Short', 'Mystery', 'History', 'Family', 'Adventure', - 'Fantasy', 'Sci-Fi', 'Western', 'Horror', 'Sport', 'War', 'Music', - 'Musical', 'Animation', 'Biography', 'Film-Noir'] + return [ + "Crime", + "Drama", + "Thriller", + "Action", + "Comedy", + "Romance", + "Documentary", + "Short", + "Mystery", + "History", + "Family", + "Adventure", + "Fantasy", + "Sci-Fi", + "Western", + "Horror", + "Sport", + "War", + "Music", + "Musical", + "Animation", + "Biography", + "Film-Noir", + ] def get_image_transforms(): @@ -122,9 +137,6 @@ def get_image_transforms(): transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), - transforms.Normalize( - mean=[0.46777044, 0.44531429, 0.40661017], - std=[0.12221994, 0.12145835, 0.14380469], - ), + transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],), ] ) diff --git a/examples/pplm/pplm_classification_head.py b/examples/pplm/pplm_classification_head.py index 9aae0f17e..05621c3bf 100644 --- a/examples/pplm/pplm_classification_head.py +++ b/examples/pplm/pplm_classification_head.py @@ -1,5 +1,6 @@ import torch + class ClassificationHead(torch.nn.Module): """Classification Head for transformer encoders""" diff --git a/examples/pplm/run_pplm.py b/examples/pplm/run_pplm.py index 095dc39a7..37183a512 100644 --- a/examples/pplm/run_pplm.py +++ b/examples/pplm/run_pplm.py @@ -1,19 +1,19 @@ #! /usr/bin/env python3 # coding=utf-8 -#Copyright (c) 2019 Uber Technologies, Inc. +# Copyright (c) 2019 Uber Technologies, Inc. # -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -#http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. 
+# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ Example command with bag of words: @@ -46,13 +46,13 @@ SMALL_CONST = 1e-15 BIG_CONST = 1e10 BAG_OF_WORDS_ARCHIVE_MAP = { - 'legal': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt", - 'military': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt", - 'politics': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt", - 'religion': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt", - 'science': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt", - 'space': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt", - 'technology': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/technology.txt", + "legal": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt", + "military": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt", + "politics": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt", + "religion": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt", + "science": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt", + "space": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt", + "technology": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/technology.txt", } DISCRIMINATOR_MODELS_PARAMS = { @@ -75,10 +75,10 @@ DISCRIMINATOR_MODELS_PARAMS = { } -def to_var(x, requires_grad=False, volatile=False, device='cuda'): - if torch.cuda.is_available() and device == 'cuda': +def to_var(x, requires_grad=False, volatile=False, device="cuda"): + if torch.cuda.is_available() and device == "cuda": x = x.cuda() - elif device != 'cuda': + elif device != "cuda": x = x.to(device) return Variable(x, requires_grad=requires_grad, volatile=volatile) @@ -95,49 +95,39 @@ def top_k_filter(logits, k, probs=False): values = torch.topk(logits, k)[0] batch_mins = values[:, -1].view(-1, 1).expand_as(logits) if probs: - return torch.where(logits < batch_mins, - torch.ones_like(logits) * 0.0, logits) - return torch.where(logits < batch_mins, - torch.ones_like(logits) * -BIG_CONST, - logits) + return torch.where(logits < batch_mins, torch.ones_like(logits) * 0.0, logits) + return torch.where(logits < batch_mins, torch.ones_like(logits) * -BIG_CONST, logits) def perturb_past( - past, - model, - last, - unpert_past=None, - unpert_logits=None, - accumulated_hidden=None, - grad_norms=None, - stepsize=0.01, - one_hot_bows_vectors=None, - classifier=None, - class_label=None, - loss_type=0, - num_iterations=3, - horizon_length=1, - window_length=0, - decay=False, - gamma=1.5, - kl_scale=0.01, - device='cuda', + past, + model, + last, + unpert_past=None, + unpert_logits=None, + accumulated_hidden=None, + grad_norms=None, + stepsize=0.01, + one_hot_bows_vectors=None, + classifier=None, + class_label=None, + loss_type=0, + num_iterations=3, + horizon_length=1, + window_length=0, + decay=False, + gamma=1.5, + kl_scale=0.01, + device="cuda", ): # Generate inital perturbed past - grad_accumulator = [ - (np.zeros(p.shape).astype("float32")) - for p in past - ] + grad_accumulator = [(np.zeros(p.shape).astype("float32")) for p 
in past] if accumulated_hidden is None: accumulated_hidden = 0 if decay: - decay_mask = torch.arange( - 0., - 1.0 + SMALL_CONST, - 1.0 / (window_length) - )[1:] + decay_mask = torch.arange(0.0, 1.0 + SMALL_CONST, 1.0 / (window_length))[1:] else: decay_mask = 1.0 @@ -146,26 +136,17 @@ def perturb_past( _, _, _, curr_length, _ = past[0].shape if curr_length > window_length and window_length > 0: - ones_key_val_shape = ( - tuple(past[0].shape[:-2]) - + tuple([window_length]) - + tuple(past[0].shape[-1:]) - ) + ones_key_val_shape = tuple(past[0].shape[:-2]) + tuple([window_length]) + tuple(past[0].shape[-1:]) zeros_key_val_shape = ( - tuple(past[0].shape[:-2]) - + tuple([curr_length - window_length]) - + tuple(past[0].shape[-1:]) + tuple(past[0].shape[:-2]) + tuple([curr_length - window_length]) + tuple(past[0].shape[-1:]) ) ones_mask = torch.ones(ones_key_val_shape) ones_mask = decay_mask * ones_mask.permute(0, 1, 2, 4, 3) ones_mask = ones_mask.permute(0, 1, 2, 4, 3) - window_mask = torch.cat( - (ones_mask, torch.zeros(zeros_key_val_shape)), - dim=-2 - ).to(device) + window_mask = torch.cat((ones_mask, torch.zeros(zeros_key_val_shape)), dim=-2).to(device) else: window_mask = torch.ones_like(past[0]).to(device) @@ -175,8 +156,7 @@ def perturb_past( for i in range(num_iterations): print("Iteration ", i + 1) curr_perturbation = [ - to_var(torch.from_numpy(p_), requires_grad=True, device=device) - for p_ in grad_accumulator + to_var(torch.from_numpy(p_), requires_grad=True, device=device) for p_ in grad_accumulator ] # Compute hidden using perturbed past @@ -184,10 +164,7 @@ def perturb_past( _, _, _, curr_length, _ = curr_perturbation[0].shape all_logits, _, all_hidden = model(last, past=perturbed_past) hidden = all_hidden[-1] - new_accumulated_hidden = accumulated_hidden + torch.sum( - hidden, - dim=1 - ).detach() + new_accumulated_hidden = accumulated_hidden + torch.sum(hidden, dim=1).detach() # TODO: Check the layer-norm consistency of this with trained discriminator (Sumanth) logits = all_logits[:, -1, :] probs = F.softmax(logits, dim=-1) @@ -210,20 +187,13 @@ def perturb_past( wte = model.resize_token_embeddings() for _ in range(horizon_length): inputs_embeds = torch.matmul(curr_probs, wte.weight.data) - _, curr_unpert_past, curr_all_hidden = model( - past=curr_unpert_past, - inputs_embeds=inputs_embeds - ) + _, curr_unpert_past, curr_all_hidden = model(past=curr_unpert_past, inputs_embeds=inputs_embeds) curr_hidden = curr_all_hidden[-1] - new_accumulated_hidden = new_accumulated_hidden + torch.sum( - curr_hidden, dim=1) + new_accumulated_hidden = new_accumulated_hidden + torch.sum(curr_hidden, dim=1) - prediction = classifier(new_accumulated_hidden / - (curr_length + 1 + horizon_length)) + prediction = classifier(new_accumulated_hidden / (curr_length + 1 + horizon_length)) - label = torch.tensor(prediction.shape[0] * [class_label], - device=device, - dtype=torch.long) + label = torch.tensor(prediction.shape[0] * [class_label], device=device, dtype=torch.long) discrim_loss = ce_loss(prediction, label) print(" pplm_discrim_loss:", discrim_loss.data.cpu().numpy()) loss += discrim_loss @@ -232,21 +202,15 @@ def perturb_past( kl_loss = 0.0 if kl_scale > 0.0: unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1) - unpert_probs = ( - unpert_probs + SMALL_CONST * - (unpert_probs <= SMALL_CONST).float().to(device).detach() - ) - correction = SMALL_CONST * (probs <= SMALL_CONST).float().to( - device).detach() + unpert_probs = unpert_probs + SMALL_CONST * (unpert_probs <= 
SMALL_CONST).float().to(device).detach() + correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(device).detach() corrected_probs = probs + correction.detach() - kl_loss = kl_scale * ( - (corrected_probs * (corrected_probs / unpert_probs).log()).sum() - ) - print(' kl_loss', kl_loss.data.cpu().numpy()) + kl_loss = kl_scale * ((corrected_probs * (corrected_probs / unpert_probs).log()).sum()) + print(" kl_loss", kl_loss.data.cpu().numpy()) loss += kl_loss loss_per_iter.append(loss.data.cpu().numpy()) - print(' pplm_loss', (loss - kl_loss).data.cpu().numpy()) + print(" pplm_loss", (loss - kl_loss).data.cpu().numpy()) # compute gradients loss.backward() @@ -259,15 +223,12 @@ def perturb_past( ] else: grad_norms = [ - (torch.norm(p_.grad * window_mask) + SMALL_CONST) - for index, p_ in enumerate(curr_perturbation) + (torch.norm(p_.grad * window_mask) + SMALL_CONST) for index, p_ in enumerate(curr_perturbation) ] # normalize gradients grad = [ - -stepsize * - (p_.grad * window_mask / grad_norms[ - index] ** gamma).data.cpu().numpy() + -stepsize * (p_.grad * window_mask / grad_norms[index] ** gamma).data.cpu().numpy() for index, p_ in enumerate(curr_perturbation) ] @@ -285,36 +246,27 @@ def perturb_past( past = new_past # apply the accumulated perturbations to the past - grad_accumulator = [ - to_var(torch.from_numpy(p_), requires_grad=True, device=device) - for p_ in grad_accumulator - ] + grad_accumulator = [to_var(torch.from_numpy(p_), requires_grad=True, device=device) for p_ in grad_accumulator] pert_past = list(map(add, past, grad_accumulator)) return pert_past, new_accumulated_hidden, grad_norms, loss_per_iter def get_classifier( - name: Optional[str], class_label: Union[str, int], - device: str + name: Optional[str], class_label: Union[str, int], device: str ) -> Tuple[Optional[ClassificationHead], Optional[int]]: if name is None: return None, None params = DISCRIMINATOR_MODELS_PARAMS[name] - classifier = ClassificationHead( - class_size=params['class_size'], - embed_size=params['embed_size'] - ).to(device) + classifier = ClassificationHead(class_size=params["class_size"], embed_size=params["embed_size"]).to(device) if "url" in params: resolved_archive_file = cached_path(params["url"]) elif "path" in params: resolved_archive_file = params["path"] else: - raise ValueError("Either url or path have to be specified " - "in the discriminator model parameters") - classifier.load_state_dict( - torch.load(resolved_archive_file, map_location=device)) + raise ValueError("Either url or path have to be specified " "in the discriminator model parameters") + classifier.load_state_dict(torch.load(resolved_archive_file, map_location=device)) classifier.eval() if isinstance(class_label, str): @@ -341,8 +293,7 @@ def get_classifier( return classifier, label_id -def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> \ - List[List[List[int]]]: +def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> List[List[List[int]]]: bow_indices = [] for id_or_path in bag_of_words_ids_or_paths: if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP: @@ -351,13 +302,11 @@ def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> filepath = id_or_path with open(filepath, "r") as f: words = f.read().strip().split("\n") - bow_indices.append( - [tokenizer.encode(word.strip(), add_prefix_space=True) for word in - words]) + bow_indices.append([tokenizer.encode(word.strip(), add_prefix_space=True) for word in words]) return bow_indices -def 
build_bows_one_hot_vectors(bow_indices, tokenizer, device='cuda'): +def build_bows_one_hot_vectors(bow_indices, tokenizer, device="cuda"): if bow_indices is None: return None @@ -373,39 +322,34 @@ def build_bows_one_hot_vectors(bow_indices, tokenizer, device='cuda'): def full_text_generation( - model, - tokenizer, - context=None, - num_samples=1, - device="cuda", - bag_of_words=None, - discrim=None, - class_label=None, - length=100, - stepsize=0.02, - temperature=1.0, - top_k=10, - sample=False, - num_iterations=3, - grad_length=10000, - horizon_length=1, - window_length=0, - decay=False, - gamma=1.5, - gm_scale=0.9, - kl_scale=0.01, - **kwargs + model, + tokenizer, + context=None, + num_samples=1, + device="cuda", + bag_of_words=None, + discrim=None, + class_label=None, + length=100, + stepsize=0.02, + temperature=1.0, + top_k=10, + sample=False, + num_iterations=3, + grad_length=10000, + horizon_length=1, + window_length=0, + decay=False, + gamma=1.5, + gm_scale=0.9, + kl_scale=0.01, + **kwargs ): - classifier, class_id = get_classifier( - discrim, - class_label, - device - ) + classifier, class_id = get_classifier(discrim, class_label, device) bow_indices = [] if bag_of_words: - bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), - tokenizer) + bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer) if bag_of_words and classifier: print("Both PPLM-BoW and PPLM-Discrim are on. This is not optimized.") @@ -423,15 +367,9 @@ def full_text_generation( raise Exception("Specify either a bag of words or a discriminator") unpert_gen_tok_text, _, _ = generate_text_pplm( - model=model, - tokenizer=tokenizer, - context=context, - device=device, - length=length, - sample=sample, - perturb=False + model=model, tokenizer=tokenizer, context=context, device=device, length=length, sample=sample, perturb=False ) - if device == 'cuda': + if device == "cuda": torch.cuda.empty_cache() pert_gen_tok_texts = [] @@ -468,36 +406,36 @@ def full_text_generation( discrim_losses.append(discrim_loss.data.cpu().numpy()) losses_in_time.append(loss_in_time) - if device == 'cuda': + if device == "cuda": torch.cuda.empty_cache() return unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time def generate_text_pplm( - model, - tokenizer, - context=None, - past=None, - device="cuda", - perturb=True, - bow_indices=None, - classifier=None, - class_label=None, - loss_type=0, - length=100, - stepsize=0.02, - temperature=1.0, - top_k=10, - sample=False, - num_iterations=3, - grad_length=10000, - horizon_length=1, - window_length=0, - decay=False, - gamma=1.5, - gm_scale=0.9, - kl_scale=0.01, + model, + tokenizer, + context=None, + past=None, + device="cuda", + perturb=True, + bow_indices=None, + classifier=None, + class_label=None, + loss_type=0, + length=100, + stepsize=0.02, + temperature=1.0, + top_k=10, + sample=False, + num_iterations=3, + grad_length=10000, + horizon_length=1, + window_length=0, + decay=False, + gamma=1.5, + gm_scale=0.9, + kl_scale=0.01, ): output_so_far = None if context: @@ -507,8 +445,7 @@ def generate_text_pplm( output_so_far = context_t # collect one hot vectors for bags of words - one_hot_bows_vectors = build_bows_one_hot_vectors(bow_indices, tokenizer, - device) + one_hot_bows_vectors = build_bows_one_hot_vectors(bow_indices, tokenizer, device) grad_norms = None last = None @@ -575,13 +512,9 @@ def generate_text_pplm( if classifier is not None: ce_loss = torch.nn.CrossEntropyLoss() prediction = classifier(torch.mean(unpert_last_hidden, dim=1)) - label = 
torch.tensor([class_label], device=device, - dtype=torch.long) + label = torch.tensor([class_label], device=device, dtype=torch.long) unpert_discrim_loss = ce_loss(prediction, label) - print( - "unperturbed discrim loss", - unpert_discrim_loss.data.cpu().numpy() - ) + print("unperturbed discrim loss", unpert_discrim_loss.data.cpu().numpy()) else: unpert_discrim_loss = 0 @@ -590,10 +523,8 @@ def generate_text_pplm( unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1) - pert_probs = ((pert_probs ** gm_scale) * ( - unpert_probs ** (1 - gm_scale))) # + SMALL_CONST - pert_probs = top_k_filter(pert_probs, k=top_k, - probs=True) # + SMALL_CONST + pert_probs = (pert_probs ** gm_scale) * (unpert_probs ** (1 - gm_scale)) # + SMALL_CONST + pert_probs = top_k_filter(pert_probs, k=top_k, probs=True) # + SMALL_CONST # rescale if torch.sum(pert_probs) <= 1: @@ -611,10 +542,7 @@ def generate_text_pplm( _, last = torch.topk(pert_probs, k=1, dim=-1) # update context/output_so_far appending the new token - output_so_far = ( - last if output_so_far is None - else torch.cat((output_so_far, last), dim=1) - ) + output_so_far = last if output_so_far is None else torch.cat((output_so_far, last), dim=1) print(tokenizer.decode(output_so_far.tolist()[0])) @@ -623,44 +551,42 @@ def generate_text_pplm( def set_generic_model_params(discrim_weights, discrim_meta): if discrim_weights is None: - raise ValueError('When using a generic discriminator, ' - 'discrim_weights need to be specified') + raise ValueError("When using a generic discriminator, " "discrim_weights need to be specified") if discrim_meta is None: - raise ValueError('When using a generic discriminator, ' - 'discrim_meta need to be specified') + raise ValueError("When using a generic discriminator, " "discrim_meta need to be specified") - with open(discrim_meta, 'r') as discrim_meta_file: + with open(discrim_meta, "r") as discrim_meta_file: meta = json.load(discrim_meta_file) - meta['path'] = discrim_weights - DISCRIMINATOR_MODELS_PARAMS['generic'] = meta + meta["path"] = discrim_weights + DISCRIMINATOR_MODELS_PARAMS["generic"] = meta def run_pplm_example( - pretrained_model="gpt2-medium", - cond_text="", - uncond=False, - num_samples=1, - bag_of_words=None, - discrim=None, - discrim_weights=None, - discrim_meta=None, - class_label=-1, - length=100, - stepsize=0.02, - temperature=1.0, - top_k=10, - sample=False, - num_iterations=3, - grad_length=10000, - horizon_length=1, - window_length=0, - decay=False, - gamma=1.5, - gm_scale=0.9, - kl_scale=0.01, - seed=0, - no_cuda=False, - colorama=False + pretrained_model="gpt2-medium", + cond_text="", + uncond=False, + num_samples=1, + bag_of_words=None, + discrim=None, + discrim_weights=None, + discrim_meta=None, + class_label=-1, + length=100, + stepsize=0.02, + temperature=1.0, + top_k=10, + sample=False, + num_iterations=3, + grad_length=10000, + horizon_length=1, + window_length=0, + decay=False, + gamma=1.5, + gm_scale=0.9, + kl_scale=0.01, + seed=0, + no_cuda=False, + colorama=False, ): # set Random seed torch.manual_seed(seed) @@ -669,21 +595,15 @@ def run_pplm_example( # set the device device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu" - if discrim == 'generic': + if discrim == "generic": set_generic_model_params(discrim_weights, discrim_meta) if discrim is not None: - pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][ - "pretrained_model" - ] - print("discrim = {}, pretrained_model set " - "to discriminator's = {}".format(discrim, pretrained_model)) + pretrained_model = 
DISCRIMINATOR_MODELS_PARAMS[discrim]["pretrained_model"] + print("discrim = {}, pretrained_model set " "to discriminator's = {}".format(discrim, pretrained_model)) # load pretrained model - model = GPT2LMHeadModel.from_pretrained( - pretrained_model, - output_hidden_states=True - ) + model = GPT2LMHeadModel.from_pretrained(pretrained_model, output_hidden_states=True) model.to(device) model.eval() @@ -696,9 +616,7 @@ def run_pplm_example( # figure out conditioning text if uncond: - tokenized_cond_text = tokenizer.encode( - [tokenizer.bos_token] - ) + tokenized_cond_text = tokenizer.encode([tokenizer.bos_token]) else: raw_text = cond_text while not raw_text: @@ -750,8 +668,7 @@ def run_pplm_example( bow_word_ids = set() if bag_of_words and colorama: - bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), - tokenizer) + bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer) for single_bow_list in bow_indices: # filtering all words in the list composed of more than 1 token filtered = list(filter(lambda x: len(x) <= 1, single_bow_list)) @@ -765,13 +682,11 @@ def run_pplm_example( if colorama: import colorama - pert_gen_text = '' + pert_gen_text = "" for word_id in pert_gen_tok_text.tolist()[0]: if word_id in bow_word_ids: - pert_gen_text += '{}{}{}'.format( - colorama.Fore.RED, - tokenizer.decode([word_id]), - colorama.Style.RESET_ALL + pert_gen_text += "{}{}{}".format( + colorama.Fore.RED, tokenizer.decode([word_id]), colorama.Style.RESET_ALL ) else: pert_gen_text += tokenizer.decode([word_id]) @@ -785,14 +700,12 @@ def run_pplm_example( pass # keep the prefix, perturbed seq, original seq for each index - generated_texts.append( - (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text) - ) + generated_texts.append((tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text)) return -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--pretrained_model", @@ -801,19 +714,10 @@ if __name__ == '__main__': default="gpt2-medium", help="pretrained model name or path to local checkpoint", ) + parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on") + parser.add_argument("--uncond", action="store_true", help="Generate from end-of-text as prefix") parser.add_argument( - "--cond_text", type=str, default="The lake", - help="Prefix texts to condition on" - ) - parser.add_argument( - "--uncond", action="store_true", - help="Generate from end-of-text as prefix" - ) - parser.add_argument( - "--num_samples", - type=int, - default=1, - help="Number of samples to generate from the modified latents", + "--num_samples", type=int, default=1, help="Number of samples to generate from the modified latents", ) parser.add_argument( "--bag_of_words", @@ -821,8 +725,8 @@ if __name__ == '__main__': type=str, default=None, help="Bags of words used for PPLM-BoW. " - "Either a BOW id (see list in code) or a filepath. " - "Multiple BoWs separated by ;", + "Either a BOW id (see list in code) or a filepath. 
" + "Multiple BoWs separated by ;", ) parser.add_argument( "--discrim", @@ -832,48 +736,36 @@ if __name__ == '__main__': choices=("clickbait", "sentiment", "toxicity", "generic"), help="Discriminator to use", ) - parser.add_argument('--discrim_weights', type=str, default=None, - help='Weights for the generic discriminator') - parser.add_argument('--discrim_meta', type=str, default=None, - help='Meta information for the generic discriminator') + parser.add_argument("--discrim_weights", type=str, default=None, help="Weights for the generic discriminator") parser.add_argument( - "--class_label", - type=int, - default=-1, - help="Class label used for the discriminator", + "--discrim_meta", type=str, default=None, help="Meta information for the generic discriminator" + ) + parser.add_argument( + "--class_label", type=int, default=-1, help="Class label used for the discriminator", ) parser.add_argument("--length", type=int, default=100) parser.add_argument("--stepsize", type=float, default=0.02) parser.add_argument("--temperature", type=float, default=1.0) parser.add_argument("--top_k", type=int, default=10) - parser.add_argument( - "--sample", action="store_true", - help="Generate from end-of-text as prefix" - ) + parser.add_argument("--sample", action="store_true", help="Generate from end-of-text as prefix") parser.add_argument("--num_iterations", type=int, default=3) parser.add_argument("--grad_length", type=int, default=10000) parser.add_argument( "--window_length", type=int, default=0, - help="Length of past which is being optimized; " - "0 corresponds to infinite window length", + help="Length of past which is being optimized; " "0 corresponds to infinite window length", ) parser.add_argument( - "--horizon_length", - type=int, - default=1, - help="Length of future to optimize over", + "--horizon_length", type=int, default=1, help="Length of future to optimize over", ) - parser.add_argument("--decay", action="store_true", - help="whether to decay or not") + parser.add_argument("--decay", action="store_true", help="whether to decay or not") parser.add_argument("--gamma", type=float, default=1.5) parser.add_argument("--gm_scale", type=float, default=0.9) parser.add_argument("--kl_scale", type=float, default=0.01) parser.add_argument("--seed", type=int, default=0) parser.add_argument("--no_cuda", action="store_true", help="no cuda") - parser.add_argument("--colorama", action="store_true", - help="colors keywords") + parser.add_argument("--colorama", action="store_true", help="colors keywords") args = parser.parse_args() run_pplm_example(**vars(args)) diff --git a/examples/pplm/run_pplm_discrim_train.py b/examples/pplm/run_pplm_discrim_train.py index 3055139d8..14136c4c7 100644 --- a/examples/pplm/run_pplm_discrim_train.py +++ b/examples/pplm/run_pplm_discrim_train.py @@ -1,19 +1,19 @@ #! /usr/bin/env python3 # coding=utf-8 -#Copyright (c) 2019 Uber Technologies, Inc. +# Copyright (c) 2019 Uber Technologies, Inc. # -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at # -#http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import argparse import csv @@ -42,26 +42,15 @@ example_sentence = "This is incredible! I love it, this is the best chicken I ha max_length_seq = 100 - - class Discriminator(torch.nn.Module): """Transformer encoder followed by a Classification Head""" - def __init__( - self, - class_size, - pretrained_model="gpt2-medium", - cached_mode=False, - device='cpu' - ): + def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"): super(Discriminator, self).__init__() self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model) self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model) self.embed_size = self.encoder.transformer.config.hidden_size - self.classifier_head = ClassificationHead( - class_size=class_size, - embed_size=self.embed_size - ) + self.classifier_head = ClassificationHead(class_size=class_size, embed_size=self.embed_size) self.cached_mode = cached_mode self.device = device @@ -74,14 +63,10 @@ class Discriminator(torch.nn.Module): self.classifier_head.train() def avg_representation(self, x): - mask = x.ne(0).unsqueeze(2).repeat( - 1, 1, self.embed_size - ).float().to(self.device).detach() + mask = x.ne(0).unsqueeze(2).repeat(1, 1, self.embed_size).float().to(self.device).detach() hidden, _ = self.encoder.transformer(x) masked_hidden = hidden * mask - avg_hidden = torch.sum(masked_hidden, dim=1) / ( - torch.sum(mask, dim=1).detach() + EPSILON - ) + avg_hidden = torch.sum(masked_hidden, dim=1) / (torch.sum(mask, dim=1).detach() + EPSILON) return avg_hidden def forward(self, x): @@ -117,10 +102,7 @@ def collate_fn(data): def pad_sequences(sequences): lengths = [len(seq) for seq in sequences] - padded_sequences = torch.zeros( - len(sequences), - max(lengths) - ).long() # padding value = 0 + padded_sequences = torch.zeros(len(sequences), max(lengths)).long() # padding value = 0 for i, seq in enumerate(sequences): end = lengths[i] @@ -149,8 +131,7 @@ def cached_collate_fn(data): return x_batch, y_batch -def train_epoch(data_loader, discriminator, optimizer, - epoch=0, log_interval=10, device='cpu'): +def train_epoch(data_loader, discriminator, optimizer, epoch=0, log_interval=10, device="cpu"): samples_so_far = 0 discriminator.train_custom() for batch_idx, (input_t, target_t) in enumerate(data_loader): @@ -169,13 +150,15 @@ def train_epoch(data_loader, discriminator, optimizer, print( "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( epoch + 1, - samples_so_far, len(data_loader.dataset), - 100 * samples_so_far / len(data_loader.dataset), loss.item() + samples_so_far, + len(data_loader.dataset), + 100 * samples_so_far / len(data_loader.dataset), + loss.item(), ) ) -def evaluate_performance(data_loader, discriminator, device='cpu'): +def evaluate_performance(data_loader, 
discriminator, device="cpu"): discriminator.eval() test_loss = 0 correct = 0 @@ -194,13 +177,12 @@ def evaluate_performance(data_loader, discriminator, device='cpu'): print( "Performance on test set: " "Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)".format( - test_loss, correct, len(data_loader.dataset), - 100. * correct / len(data_loader.dataset) + test_loss, correct, len(data_loader.dataset), 100.0 * correct / len(data_loader.dataset) ) ) -def predict(input_sentence, model, classes, cached=False, device='cpu'): +def predict(input_sentence, model, classes, cached=False, device="cpu"): input_t = model.tokenizer.encode(input_sentence) input_t = torch.tensor([input_t], dtype=torch.long, device=device) if cached: @@ -208,17 +190,14 @@ def predict(input_sentence, model, classes, cached=False, device='cpu'): log_probs = model(input_t).data.cpu().numpy().flatten().tolist() print("Input sentence:", input_sentence) - print("Predictions:", ", ".join( - "{}: {:.4f}".format(c, math.exp(log_prob)) for c, log_prob in - zip(classes, log_probs) - )) + print( + "Predictions:", + ", ".join("{}: {:.4f}".format(c, math.exp(log_prob)) for c, log_prob in zip(classes, log_probs)), + ) -def get_cached_data_loader(dataset, batch_size, discriminator, - shuffle=False, device='cpu'): - data_loader = torch.utils.data.DataLoader(dataset=dataset, - batch_size=batch_size, - collate_fn=collate_fn) +def get_cached_data_loader(dataset, batch_size, discriminator, shuffle=False, device="cpu"): + data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=collate_fn) xs = [] ys = [] @@ -231,50 +210,44 @@ def get_cached_data_loader(dataset, batch_size, discriminator, ys += y.cpu().numpy().tolist() data_loader = torch.utils.data.DataLoader( - dataset=Dataset(xs, ys), - batch_size=batch_size, - shuffle=shuffle, - collate_fn=cached_collate_fn) + dataset=Dataset(xs, ys), batch_size=batch_size, shuffle=shuffle, collate_fn=cached_collate_fn + ) return data_loader def train_discriminator( - dataset, dataset_fp=None, pretrained_model="gpt2-medium", - epochs=10, batch_size=64, log_interval=10, - save_model=False, cached=False, no_cuda=False): + dataset, + dataset_fp=None, + pretrained_model="gpt2-medium", + epochs=10, + batch_size=64, + log_interval=10, + save_model=False, + cached=False, + no_cuda=False, +): device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu" print("Preprocessing {} dataset...".format(dataset)) start = time.time() if dataset == "SST": - idx2class = ["positive", "negative", "very positive", "very negative", - "neutral"] + idx2class = ["positive", "negative", "very positive", "very negative", "neutral"] class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator( - class_size=len(idx2class), - pretrained_model=pretrained_model, - cached_mode=cached, - device=device + class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device ).to(device) text = torchtext_data.Field() label = torchtext_data.Field(sequential=False) - train_data, val_data, test_data = datasets.SST.splits( - text, - label, - fine_grained=True, - train_subtrees=True, - ) + train_data, val_data, test_data = datasets.SST.splits(text, label, fine_grained=True, train_subtrees=True,) x = [] y = [] for i in trange(len(train_data), ascii=True): - seq = TreebankWordDetokenizer().detokenize( - vars(train_data[i])["text"] - ) + seq = TreebankWordDetokenizer().detokenize(vars(train_data[i])["text"]) seq = discriminator.tokenizer.encode(seq) seq = 
torch.tensor([50256] + seq, device=device, dtype=torch.long) x.append(seq) @@ -284,9 +257,7 @@ def train_discriminator( test_x = [] test_y = [] for i in trange(len(test_data), ascii=True): - seq = TreebankWordDetokenizer().detokenize( - vars(test_data[i])["text"] - ) + seq = TreebankWordDetokenizer().detokenize(vars(test_data[i])["text"]) seq = discriminator.tokenizer.encode(seq) seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) test_x.append(seq) @@ -306,10 +277,7 @@ def train_discriminator( class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator( - class_size=len(idx2class), - pretrained_model=pretrained_model, - cached_mode=cached, - device=device + class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device ).to(device) with open("datasets/clickbait/clickbait_train_prefix.txt") as f: @@ -318,9 +286,7 @@ def train_discriminator( try: data.append(eval(line)) except: - print("Error evaluating line {}: {}".format( - i, line - )) + print("Error evaluating line {}: {}".format(i, line)) continue x = [] y = [] @@ -331,27 +297,20 @@ def train_discriminator( seq = discriminator.tokenizer.encode(d["text"]) if len(seq) < max_length_seq: - seq = torch.tensor( - [50256] + seq, device=device, dtype=torch.long - ) + seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) else: - print("Line {} is longer than maximum length {}".format( - i, max_length_seq - )) + print("Line {} is longer than maximum length {}".format(i, max_length_seq)) continue x.append(seq) y.append(d["label"]) except: - print("Error evaluating / tokenizing" - " line {}, skipping it".format(i)) + print("Error evaluating / tokenizing" " line {}, skipping it".format(i)) pass full_dataset = Dataset(x, y) train_size = int(0.9 * len(full_dataset)) test_size = len(full_dataset) - train_size - train_dataset, test_dataset = torch.utils.data.random_split( - full_dataset, [train_size, test_size] - ) + train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size]) discriminator_meta = { "class_size": len(idx2class), @@ -366,10 +325,7 @@ def train_discriminator( class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator( - class_size=len(idx2class), - pretrained_model=pretrained_model, - cached_mode=cached, - device=device + class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device ).to(device) x = [] @@ -381,27 +337,20 @@ def train_discriminator( seq = discriminator.tokenizer.encode(d["text"]) if len(seq) < max_length_seq: - seq = torch.tensor( - [50256] + seq, device=device, dtype=torch.long - ) + seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) else: - print("Line {} is longer than maximum length {}".format( - i, max_length_seq - )) + print("Line {} is longer than maximum length {}".format(i, max_length_seq)) continue x.append(seq) y.append(int(np.sum(d["label"]) > 0)) except: - print("Error evaluating / tokenizing" - " line {}, skipping it".format(i)) + print("Error evaluating / tokenizing" " line {}, skipping it".format(i)) pass full_dataset = Dataset(x, y) train_size = int(0.9 * len(full_dataset)) test_size = len(full_dataset) - train_size - train_dataset, test_dataset = torch.utils.data.random_split( - full_dataset, [train_size, test_size] - ) + train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size]) discriminator_meta = { "class_size": len(idx2class), @@ -416,8 +365,7 @@ def 
train_discriminator( # class \t text if dataset_fp is None: - raise ValueError("When generic dataset is selected, " - "dataset_fp needs to be specified aswell.") + raise ValueError("When generic dataset is selected, " "dataset_fp needs to be specified aswell.") classes = set() with open(dataset_fp) as f: @@ -430,10 +378,7 @@ def train_discriminator( class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator( - class_size=len(idx2class), - pretrained_model=pretrained_model, - cached_mode=cached, - device=device + class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device ).to(device) x = [] @@ -447,18 +392,11 @@ def train_discriminator( try: seq = discriminator.tokenizer.encode(text) - if (len(seq) < max_length_seq): - seq = torch.tensor( - [50256] + seq, - device=device, - dtype=torch.long - ) + if len(seq) < max_length_seq: + seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) else: - print( - "Line {} is longer than maximum length {}".format( - i, max_length_seq - )) + print("Line {} is longer than maximum length {}".format(i, max_length_seq)) continue x.append(seq) @@ -471,10 +409,7 @@ def train_discriminator( full_dataset = Dataset(x, y) train_size = int(0.9 * len(full_dataset)) test_size = len(full_dataset) - train_size - train_dataset, test_dataset = torch.utils.data.random_split( - full_dataset, - [train_size, test_size] - ) + train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size]) discriminator_meta = { "class_size": len(idx2class), @@ -485,9 +420,7 @@ def train_discriminator( } end = time.time() - print("Preprocessed {} data points".format( - len(train_dataset) + len(test_dataset)) - ) + print("Preprocessed {} data points".format(len(train_dataset) + len(test_dataset))) print("Data preprocessing took: {:.3f}s".format(end - start)) if cached: @@ -495,30 +428,21 @@ def train_discriminator( start = time.time() - train_loader = get_cached_data_loader( - train_dataset, batch_size, discriminator, - shuffle=True, device=device - ) + train_loader = get_cached_data_loader(train_dataset, batch_size, discriminator, shuffle=True, device=device) - test_loader = get_cached_data_loader( - test_dataset, batch_size, discriminator, device=device - ) + test_loader = get_cached_data_loader(test_dataset, batch_size, discriminator, device=device) end = time.time() print("Building representation cache took: {:.3f}s".format(end - start)) else: - train_loader = torch.utils.data.DataLoader(dataset=train_dataset, - batch_size=batch_size, - shuffle=True, - collate_fn=collate_fn) - test_loader = torch.utils.data.DataLoader(dataset=test_dataset, - batch_size=batch_size, - collate_fn=collate_fn) + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn + ) + test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, collate_fn=collate_fn) if save_model: - with open("{}_classifier_head_meta.json".format(dataset), - "w") as meta_file: + with open("{}_classifier_head_meta.json".format(dataset), "w") as meta_file: json.dump(discriminator_meta, meta_file) optimizer = optim.Adam(discriminator.parameters(), lr=0.0001) @@ -533,56 +457,61 @@ def train_discriminator( optimizer=optimizer, epoch=epoch, log_interval=log_interval, - device=device - ) - evaluate_performance( - data_loader=test_loader, - discriminator=discriminator, - device=device + device=device, ) + 
evaluate_performance(data_loader=test_loader, discriminator=discriminator, device=device) end = time.time() print("Epoch took: {:.3f}s".format(end - start)) print("\nExample prediction") - predict(example_sentence, discriminator, idx2class, - cached=cached, device=device) + predict(example_sentence, discriminator, idx2class, cached=cached, device=device) if save_model: # torch.save(discriminator.state_dict(), # "{}_discriminator_{}.pt".format( # args.dataset, epoch + 1 # )) - torch.save(discriminator.get_classifier().state_dict(), - "{}_classifier_head_epoch_{}.pt".format(dataset, - epoch + 1)) + torch.save( + discriminator.get_classifier().state_dict(), + "{}_classifier_head_epoch_{}.pt".format(dataset, epoch + 1), + ) if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Train a discriminator on top of GPT-2 representations") - parser.add_argument("--dataset", type=str, default="SST", - choices=("SST", "clickbait", "toxic", "generic"), - help="dataset to train the discriminator on." - "In case of generic, the dataset is expected" - "to be a TSBV file with structure: class \\t text") - parser.add_argument("--dataset_fp", type=str, default="", - help="File path of the dataset to use. " - "Needed only in case of generic datadset") - parser.add_argument("--pretrained_model", type=str, default="gpt2-medium", - help="Pretrained model to use as encoder") - parser.add_argument("--epochs", type=int, default=10, metavar="N", - help="Number of training epochs") - parser.add_argument("--batch_size", type=int, default=64, metavar="N", - help="input batch size for training (default: 64)") - parser.add_argument("--log_interval", type=int, default=10, metavar="N", - help="how many batches to wait before logging training status") - parser.add_argument("--save_model", action="store_true", - help="whether to save the model") - parser.add_argument("--cached", action="store_true", - help="whether to cache the input representations") - parser.add_argument("--no_cuda", action="store_true", - help="use to turn off cuda") + parser = argparse.ArgumentParser(description="Train a discriminator on top of GPT-2 representations") + parser.add_argument( + "--dataset", + type=str, + default="SST", + choices=("SST", "clickbait", "toxic", "generic"), + help="dataset to train the discriminator on." + "In case of generic, the dataset is expected" + "to be a TSBV file with structure: class \\t text", + ) + parser.add_argument( + "--dataset_fp", + type=str, + default="", + help="File path of the dataset to use. 
" "Needed only in case of generic datadset", + ) + parser.add_argument( + "--pretrained_model", type=str, default="gpt2-medium", help="Pretrained model to use as encoder" + ) + parser.add_argument("--epochs", type=int, default=10, metavar="N", help="Number of training epochs") + parser.add_argument( + "--batch_size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)" + ) + parser.add_argument( + "--log_interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument("--save_model", action="store_true", help="whether to save the model") + parser.add_argument("--cached", action="store_true", help="whether to cache the input representations") + parser.add_argument("--no_cuda", action="store_true", help="use to turn off cuda") args = parser.parse_args() train_discriminator(**(vars(args))) diff --git a/examples/run_bertology.py b/examples/run_bertology.py index d1d05a107..6b4739d6b 100644 --- a/examples/run_bertology.py +++ b/examples/run_bertology.py @@ -32,10 +32,18 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse from torch.utils.data.distributed import DistributedSampler from torch.nn import CrossEntropyLoss, MSELoss -from transformers import (WEIGHTS_NAME, - BertConfig, BertForSequenceClassification, BertTokenizer, - XLMConfig, XLMForSequenceClassification, XLMTokenizer, - XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForSequenceClassification, + BertTokenizer, + XLMConfig, + XLMForSequenceClassification, + XLMTokenizer, + XLNetConfig, + XLNetForSequenceClassification, + XLNetTokenizer, +) from run_glue import set_seed, load_and_cache_examples, ALL_MODELS, MODEL_CLASSES @@ -63,7 +71,9 @@ def print_2d_tensor(tensor): logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data)) -def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None): +def compute_heads_importance( + args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None +): """ This method shows how to compute: - head attention entropy - head importance scores according to http://arxiv.org/abs/1905.10650 @@ -85,8 +95,14 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, input_ids, input_mask, segment_ids, label_ids = batch # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below) - outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask) - loss, logits, all_attentions = outputs[0], outputs[1], outputs[-1] # Loss and logits are the first, attention the last + outputs = model( + input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask + ) + loss, logits, all_attentions = ( + outputs[0], + outputs[1], + outputs[-1], + ) # Loss and logits are the first, attention the last loss.backward() # Backpropagate to populate the gradients in the head mask if compute_entropy: @@ -113,15 +129,15 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, # Layerwise importance normalization if not args.dont_normalize_importance_by_layer: exponent = 2 - norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1/exponent) + norm_by_layer = torch.pow(torch.pow(head_importance, 
exponent).sum(-1), 1 / exponent) head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20 if not args.dont_normalize_global_importance: head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min()) # Print/save matrices - np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy()) - np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy()) + np.save(os.path.join(args.output_dir, "attn_entropy.npy"), attn_entropy.detach().cpu().numpy()) + np.save(os.path.join(args.output_dir, "head_importance.npy"), head_importance.detach().cpu().numpy()) logger.info("Attention entropies") print_2d_tensor(attn_entropy) @@ -129,7 +145,9 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, print_2d_tensor(head_importance) logger.info("Head ranked by importance scores") head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device) - head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device) + head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange( + head_importance.numel(), device=args.device + ) head_ranks = head_ranks.view_as(head_importance) print_2d_tensor(head_ranks) @@ -150,9 +168,9 @@ def mask_heads(args, model, eval_dataloader): current_score = original_score while current_score >= original_score * args.masking_threshold: - head_mask = new_head_mask.clone() # save current head mask + head_mask = new_head_mask.clone() # save current head mask # heads from least important to most - keep only not-masked heads - head_importance[head_mask == 0.0] = float('Inf') + head_importance[head_mask == 0.0] = float("Inf") current_heads_to_mask = head_importance.view(-1).sort()[1] if len(current_heads_to_mask) <= num_to_mask: @@ -167,14 +185,21 @@ def mask_heads(args, model, eval_dataloader): print_2d_tensor(new_head_mask) # Compute metric and head importance again - _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask) + _, head_importance, preds, labels = compute_heads_importance( + args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask + ) preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name] - logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100) + logger.info( + "Masking: current score: %f, remaning heads %d (%.1f percents)", + current_score, + new_head_mask.sum(), + new_head_mask.sum() / new_head_mask.numel() * 100, + ) logger.info("Final head mask") print_2d_tensor(head_mask) - np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy()) + np.save(os.path.join(args.output_dir, "head_mask.npy"), head_mask.detach().cpu().numpy()) return head_mask @@ -186,8 +211,9 @@ def prune_heads(args, model, eval_dataloader, head_mask): # Try pruning and test time speedup # Pruning is like masking but we actually remove the masked weights before_time = datetime.now() - _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader, - compute_entropy=False, compute_importance=False, head_mask=head_mask) + _, _, preds, labels = compute_heads_importance( + args, model, eval_dataloader, 
compute_entropy=False, compute_importance=False, head_mask=head_mask + ) preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name] original_time = datetime.now() - before_time @@ -199,73 +225,127 @@ def prune_heads(args, model, eval_dataloader, head_mask): pruned_num_params = sum(p.numel() for p in model.parameters()) before_time = datetime.now() - _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader, - compute_entropy=False, compute_importance=False, head_mask=None) + _, _, preds, labels = compute_heads_importance( + args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=None + ) preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name] new_time = datetime.now() - before_time - logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100) + logger.info( + "Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", + original_num_params, + pruned_num_params, + pruned_num_params / original_num_params * 100, + ) logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning) - logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100) + logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time / new_time * 100) def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. Should contain the .tsv files (or other data files) for the task.") - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join( - ALL_MODELS)) - parser.add_argument("--task_name", default=None, type=str, required=True, - help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. 
Should contain the .tsv files (or other data files) for the task.", + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train selected in the list: " + ", ".join(processors.keys()), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name_or_path") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name_or_path") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--data_subset", type=int, default=-1, - help="If > 0: limit the data to a subset of data_subset instances.") - parser.add_argument("--overwrite_output_dir", action='store_true', - help="Whether to overwrite data in output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - - parser.add_argument("--dont_normalize_importance_by_layer", action='store_true', - help="Don't normalize importance score by layers") - parser.add_argument("--dont_normalize_global_importance", action='store_true', - help="Don't normalize all importance scores between 0 and 1") - - parser.add_argument("--try_masking", action='store_true', - help="Whether to try to mask head until a threshold of accuracy.") - parser.add_argument("--masking_threshold", default=0.9, type=float, - help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).") - parser.add_argument("--masking_amount", default=0.1, type=float, - help="Amount to heads to masking at each masking step.") - parser.add_argument("--metric_name", default="acc", type=str, - help="Metric to use for head masking.") - - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after WordPiece tokenization. \n" - "Sequences longer than this will be truncated, sequences shorter padded.") + parser.add_argument( + "--config_name", + default="", + type=str, + help="Pretrained config name or path if not the same as model_name_or_path", + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name_or_path", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + parser.add_argument( + "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances." 
+ ) + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Whether to overwrite data in output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + + parser.add_argument( + "--dont_normalize_importance_by_layer", action="store_true", help="Don't normalize importance score by layers" + ) + parser.add_argument( + "--dont_normalize_global_importance", + action="store_true", + help="Don't normalize all importance scores between 0 and 1", + ) + + parser.add_argument( + "--try_masking", action="store_true", help="Whether to try to mask head until a threshold of accuracy." + ) + parser.add_argument( + "--masking_threshold", + default=0.9, + type=float, + help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).", + ) + parser.add_argument( + "--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step." + ) + parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.") + + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, sequences shorter padded.", + ) parser.add_argument("--batch_size", default=1, type=int, help="Batch size.") parser.add_argument("--seed", type=int, default=42) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") - parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") + parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -278,10 +358,10 @@ def main(): torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) args.n_gpu = 1 - torch.distributed.init_process_group(backend='nccl') # Initializes the distributed backend + torch.distributed.init_process_group(backend="nccl") # Initializes the distributed backend # Setup logging - logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) + logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1))) # Set seeds @@ -306,17 +386,23 @@ def main(): args.model_type = key # take the first match in model types break config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels, - finetuning_task=args.task_name, - 
output_attentions=True, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + num_labels=num_labels, + finetuning_task=args.task_name, + output_attentions=True, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -324,14 +410,14 @@ def main(): # Distributed and parallel training model.to(args.device) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) # Print/save training arguments - torch.save(args, os.path.join(args.output_dir, 'run_args.bin')) + torch.save(args, os.path.join(args.output_dir, "run_args.bin")) logger.info("Training/evaluation parameters %s", args) # Prepare dataset for the GLUE task @@ -341,11 +427,9 @@ def main(): eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) - # Compute head entropy and importance score compute_heads_importance(args, model, eval_dataloader) - # Try head masking (set heads to zero until the score goes under a threshole) # and head pruning (remove masked heads and see the effect on the network) if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0: @@ -353,5 +437,5 @@ def main(): prune_heads(args, model, eval_dataloader, head_mask) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/run_generation.py b/examples/run_generation.py index 536d4a18f..e62ccf87c 100644 --- a/examples/run_generation.py +++ b/examples/run_generation.py @@ -33,9 +33,7 @@ from transformers import XLMWithLMHeadModel, XLMTokenizer logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger = logging.getLogger(__name__) @@ -71,6 +69,7 @@ def set_seed(args): if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) + # # Functions to prepare models' input # @@ -78,15 +77,11 @@ def set_seed(args): def prepare_ctrl_input(args, _, tokenizer, prompt_text): if args.temperature > 0.7: - logger.info( - "CTRL typically works better with lower temperatures (and lower 
top_k)." - ) + logger.info("CTRL typically works better with lower temperatures (and lower top_k).") encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False) if not any(encoded_prompt[0] == x for x in tokenizer.control_codes.values()): - logger.info( - "WARNING! You are not starting your generation from a control code so you won't get good results" - ) + logger.info("WARNING! You are not starting your generation from a control code so you won't get good results") return prompt_text @@ -102,11 +97,7 @@ def prepare_xlm_input(args, model, tokenizer, prompt_text): else: language = None while language not in available_languages: - language = input( - "Using XLM. Select language in " - + str(list(available_languages)) - + " >>> " - ) + language = input("Using XLM. Select language in " + str(list(available_languages)) + " >>> ") # kwargs["language"] = tokenizer.lang2id[language] # TODO fix mask_token_id setup when configurations will be synchronized between models and tokenizers @@ -148,17 +139,34 @@ def adjust_length_to_model(length, max_sequence_length): def main(): parser = argparse.ArgumentParser() - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys())) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) parser.add_argument("--prompt", type=str, default="") parser.add_argument("--length", type=int, default=20) parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped") - parser.add_argument("--temperature", type=float, default=1.0, help="temperature of 1.0 has no effect, lower tend toward greedy sampling") - parser.add_argument("--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2") + parser.add_argument( + "--temperature", + type=float, + default=1.0, + help="temperature of 1.0 has no effect, lower tend toward greedy sampling", + ) + parser.add_argument( + "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2" + ) parser.add_argument("--k", type=int, default=0) parser.add_argument("--p", type=float, default=0.9) @@ -169,9 +177,7 @@ def main(): parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") args = parser.parse_args() - args.device = torch.device( - "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu" - ) + args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() set_seed(args) @@ -181,17 +187,13 @@ def main(): args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] except KeyError: - raise KeyError( - "the model {} you specified is not supported. You are welcome to add it and open a PR :)" - ) + raise KeyError("the model {} you specified is not supported. 
You are welcome to add it and open a PR :)") tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) model = model_class.from_pretrained(args.model_name_or_path) model.to(args.device) - args.length = adjust_length_to_model( - args.length, max_sequence_length=model.config.max_position_embeddings - ) + args.length = adjust_length_to_model(args.length, max_sequence_length=model.config.max_position_embeddings) logger.info(args) prompt_text = args.prompt if args.prompt else input("Model prompt >>> ") @@ -201,7 +203,7 @@ def main(): if requires_preprocessing: prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type) prompt_text = prepare_input(args, model, tokenizer, prompt_text) - encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors='pt') + encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt") output_sequences = model.generate( input_ids=encoded_prompt, @@ -212,7 +214,7 @@ def main(): repetition_penalty=args.repetition_penalty, ) - # Batch size == 1. to add more examples please use num_return_sequences > 1 + # Batch size == 1. to add more examples please use num_return_sequences > 1 generated_sequence = output_sequences[0].tolist() text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) text = text[: t.find(args.stop_token) if args.stop_token else None] diff --git a/examples/run_glue.py b/examples/run_glue.py index c143b6205..bbfd52ea3 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -26,8 +26,7 @@ import json import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler try: @@ -37,25 +36,30 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, BertConfig, - BertForSequenceClassification, BertTokenizer, - RobertaConfig, - RobertaForSequenceClassification, - RobertaTokenizer, - XLMConfig, XLMForSequenceClassification, - XLMTokenizer, XLNetConfig, - XLNetForSequenceClassification, - XLNetTokenizer, - DistilBertConfig, - DistilBertForSequenceClassification, - DistilBertTokenizer, - AlbertConfig, - AlbertForSequenceClassification, - AlbertTokenizer, - XLMRobertaConfig, - XLMRobertaForSequenceClassification, - XLMRobertaTokenizer, - ) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForSequenceClassification, + BertTokenizer, + RobertaConfig, + RobertaForSequenceClassification, + RobertaTokenizer, + XLMConfig, + XLMForSequenceClassification, + XLMTokenizer, + XLNetConfig, + XLNetForSequenceClassification, + XLNetTokenizer, + DistilBertConfig, + DistilBertForSequenceClassification, + DistilBertTokenizer, + AlbertConfig, + AlbertForSequenceClassification, + AlbertTokenizer, + XLMRobertaConfig, + XLMRobertaForSequenceClassification, + XLMRobertaTokenizer, +) from transformers import AdamW, get_linear_schedule_with_warmup @@ -66,17 +70,22 @@ from transformers import glue_convert_examples_to_features as convert_examples_t logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, - RobertaConfig, DistilBertConfig)), ()) +ALL_MODELS = sum( + ( + tuple(conf.pretrained_config_archive_map.keys()) + for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig) + ), + (), +) MODEL_CLASSES = { - 'bert': (BertConfig, 
BertForSequenceClassification, BertTokenizer), - 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), - 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), - 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer), - 'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer), - 'xlmroberta': (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer), + "bert": (BertConfig, BertForSequenceClassification, BertTokenizer), + "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), + "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer), + "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), + "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer), + "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer), + "xlmroberta": (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer), } @@ -104,20 +113,27 @@ def train(args, train_dataset, model, tokenizer): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) # Check if saved optimizer or scheduler states exist - if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')): + if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( + os.path.join(args.model_name_or_path, "scheduler.pt") + ): # Load in optimizer and scheduler states - optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt'))) - scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt'))) + optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) + scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: @@ -132,17 +148,21 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # 
Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -152,7 +172,7 @@ def train(args, train_dataset, model, tokenizer): # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path - global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0]) + global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) @@ -163,7 +183,9 @@ def train(args, train_dataset, model, tokenizer): tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() - train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) + train_iterator = trange( + epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] + ) set_seed(args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) @@ -176,16 +198,16 @@ def train(args, train_dataset, model, tokenizer): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + if args.model_type != "distilbert": + inputs["token_type_ids"] = ( + batch[2] if args.model_type in ["bert", "xlnet"] else None + ) # XLM, DistilBERT and RoBERTa don't use segment_ids outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training + loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -209,36 +231,40 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - 
eval_key = 'eval_{}'.format(key) + eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] - logs['learning_rate'] = learning_rate_scalar - logs['loss'] = loss_scalar + logs["learning_rate"] = learning_rate_scalar + logs["loss"] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) - print(json.dumps({**logs, **{'step': global_step}})) + print(json.dumps({**logs, **{"step": global_step}})) if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) - torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt')) - torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt')) + torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) + torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -257,7 +283,7 @@ def train(args, train_dataset, model, tokenizer): def evaluate(args, model, tokenizer, prefix=""): # Loop to handle MNLI double evaluation (matched, mis-matched) eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) - eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,) + eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,) results = {} for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): @@ -288,11 +314,11 @@ def evaluate(args, model, tokenizer, prefix=""): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + if args.model_type != "distilbert": + inputs["token_type_ids"] = ( + batch[2] if args.model_type in ["bert", "xlnet"] else None + ) # XLM, DistilBERT and RoBERTa don't use segment_ids outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -300,10 +326,10 @@ def evaluate(args, model, tokenizer, prefix=""): nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() - out_label_ids = inputs['labels'].detach().cpu().numpy() + out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, 
inputs['labels'].detach().cpu().numpy(), axis=0) + out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps if args.output_mode == "classification": @@ -330,29 +356,36 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): processor = processors[task]() output_mode = output_modes[task] # Load data features from cache or dataset file - cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length), - str(task))) + cached_features_file = os.path.join( + args.data_dir, + "cached_{}_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + str(task), + ), + ) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", args.data_dir) label_list = processor.get_labels() - if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta', 'xlmroberta']: + if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]: # HACK(label indices are swapped in RoBERTa pretrained model) label_list[1], label_list[2] = label_list[2], label_list[1] - examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - features = convert_examples_to_features(examples, - tokenizer, - label_list=label_list, - max_length=args.max_seq_length, - output_mode=output_mode, - pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, + examples = ( + processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) + ) + features = convert_examples_to_features( + examples, + tokenizer, + label_list=label_list, + max_length=args.max_seq_length, + output_mode=output_mode, + pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) @@ -369,7 +402,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): all_labels = torch.tensor([f.label for f in features], dtype=torch.long) elif output_mode == "regression": all_labels = torch.tensor([f.label for f in features], dtype=torch.float) - + dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) return dataset @@ -378,90 +411,149 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. 
Should contain the .tsv files (or other data files) for the task.") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--task_name", default=None, type=str, required=True, - help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the .tsv files (or other data files) for the task.", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train selected in the list: " + ", ".join(processors.keys()), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after tokenization. 
Sequences longer " - "than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") - - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight decay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Avoid using CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") - - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
- "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--local_rank", type=int, default=-1, - help="For distributed training: local_rank") - parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) + + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") + + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -473,16 +565,24 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -502,17 +602,23 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels, - finetuning_task=args.task_name, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + num_labels=num_labels, + finetuning_task=args.task_name, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -521,14 +627,12 @@ def main(): logger.info("Training/evaluation parameters %s", args) - # Training if args.do_train: train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Saving best-practices: if you use defaults names for 
the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed @@ -538,36 +642,39 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir) model.to(args.device) - # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) - result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) + result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) return results diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index b659e229b..60b99f29d 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -42,37 +42,55 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, - BertConfig, BertForMaskedLM, BertTokenizer, - GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, - OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, - RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, - DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer, - CamembertConfig, CamembertForMaskedLM, CamembertTokenizer) +from transformers import ( + WEIGHTS_NAME, + AdamW, + get_linear_schedule_with_warmup, + BertConfig, + BertForMaskedLM, + BertTokenizer, + GPT2Config, + GPT2LMHeadModel, + GPT2Tokenizer, + OpenAIGPTConfig, + OpenAIGPTLMHeadModel, + OpenAIGPTTokenizer, + RobertaConfig, + RobertaForMaskedLM, + RobertaTokenizer, + DistilBertConfig, + DistilBertForMaskedLM, + DistilBertTokenizer, + CamembertConfig, + CamembertForMaskedLM, + 
CamembertTokenizer, +) logger = logging.getLogger(__name__) MODEL_CLASSES = { - 'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer), - 'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), - 'bert': (BertConfig, BertForMaskedLM, BertTokenizer), - 'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer), - 'camembert': (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer) + "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer), + "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), + "bert": (BertConfig, BertForMaskedLM, BertTokenizer), + "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer), + "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer), + "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer), } class TextDataset(Dataset): - def __init__(self, tokenizer, args, file_path='train', block_size=512): + def __init__(self, tokenizer, args, file_path="train", block_size=512): assert os.path.isfile(file_path) directory, filename = os.path.split(file_path) - cached_features_file = os.path.join(directory, args.model_name_or_path + '_cached_lm_' + str(block_size) + '_' + filename) + cached_features_file = os.path.join( + directory, args.model_name_or_path + "_cached_lm_" + str(block_size) + "_" + filename + ) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) - with open(cached_features_file, 'rb') as handle: + with open(cached_features_file, "rb") as handle: self.examples = pickle.load(handle) else: logger.info("Creating features from dataset file at %s", directory) @@ -83,14 +101,14 @@ class TextDataset(Dataset): tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) - for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size - self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size])) + for i in range(0, len(tokenized_text) - block_size + 1, block_size): # Truncate in block of block_size + self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])) # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) # If your dataset is small, first you should loook for a bigger one :-) and second you # can change this behavior by adding (model specific) padding. 
logger.info("Saving features into cached file %s", cached_features_file) - with open(cached_features_file, 'wb') as handle: + with open(cached_features_file, "wb") as handle: pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL) def __len__(self): @@ -101,7 +119,12 @@ class TextDataset(Dataset): def load_and_cache_examples(args, tokenizer, evaluate=False): - dataset = TextDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size) + dataset = TextDataset( + tokenizer, + args, + file_path=args.eval_data_file if evaluate else args.train_data_file, + block_size=args.block_size, + ) return dataset @@ -120,7 +143,7 @@ def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False): return # Check if we should delete older checkpoint(s) - glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix))) + glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix))) if len(glob_checkpoints) <= args.save_total_limit: return @@ -129,7 +152,7 @@ def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False): if use_mtime: ordering_and_checkpoint_path.append((os.path.getmtime(path), path)) else: - regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path) + regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path) if regex_match and regex_match.groups(): ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path)) @@ -147,7 +170,9 @@ def mask_tokens(inputs, tokenizer, args): labels = inputs.clone() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, args.mlm_probability) - special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()] + special_tokens_mask = [ + tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() labels[~masked_indices] = -100 # We only compute loss on masked tokens @@ -181,19 +206,26 @@ def train(args, train_dataset, model, tokenizer): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) # Check if saved optimizer or scheduler states exist - if 
os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')): + if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( + os.path.join(args.model_name_or_path, "scheduler.pt") + ): # Load in optimizer and scheduler states - optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt'))) - scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt'))) + optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) + scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: @@ -208,17 +240,21 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -228,7 +264,7 @@ def train(args, train_dataset, model, tokenizer): # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path - global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0]) + global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) @@ -239,16 +275,18 @@ def train(args, train_dataset, model, tokenizer): tr_loss, logging_loss = 0.0, 0.0 - model_to_resize = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_resize = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training model_to_resize.resize_token_embeddings(len(tokenizer)) model.zero_grad() - train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) + train_iterator = trange( + epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] + ) set_seed(args) # Added here for reproducibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): - + # Skip 
past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 @@ -285,31 +323,35 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + tb_writer.add_scalar("eval_{}".format(key), value, global_step) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: - checkpoint_prefix = 'checkpoint' + checkpoint_prefix = "checkpoint" # Save model checkpoint - output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step)) + output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) - torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt')) - torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt')) + torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) + torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -365,9 +407,7 @@ def evaluate(args, model, tokenizer, prefix=""): eval_loss = eval_loss / nb_eval_steps perplexity = torch.exp(torch.tensor(eval_loss)) - result = { - "perplexity": perplexity - } + result = {"perplexity": perplexity} output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") with open(output_eval_file, "w") as writer: @@ -383,107 +423,167 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--train_data_file", default=None, type=str, required=True, - help="The input training data file (a text file).") - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)." 
+ ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--eval_data_file", default=None, type=str, - help="An optional input evaluation data file to evaluate the perplexity on (a text file).") - - parser.add_argument("--model_type", default="bert", type=str, - help="The model architecture to be fine-tuned.") - parser.add_argument("--model_name_or_path", default="bert-base-cased", type=str, - help="The model checkpoint for weights initialization.") - - parser.add_argument("--mlm", action='store_true', - help="Train with masked-language modeling loss instead of language modeling.") - parser.add_argument("--mlm_probability", type=float, default=0.15, - help="Ratio of tokens to mask for masked language modeling loss") - - parser.add_argument("--config_name", default="", type=str, - help="Optional pretrained config name or path if not the same as model_name_or_path") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Optional pretrained tokenizer name or path if not the same as model_name_or_path") - parser.add_argument("--cache_dir", default="", type=str, - help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)") - parser.add_argument("--block_size", default=-1, type=int, - help="Optional input sequence length after tokenization." - "The training dataset will be truncated in block of this size for training." - "Default to the model max input length for single sentence inputs (take into account special tokens).") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Run evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") - - parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight decay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=1.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument('--save_total_limit', type=int, default=None, - help='Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default') - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Avoid using CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") - - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--local_rank", type=int, default=-1, - help="For distributed training: local_rank") - parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") + parser.add_argument( + "--eval_data_file", + default=None, + type=str, + help="An optional input evaluation data file to evaluate the perplexity on (a text file).", + ) + + parser.add_argument("--model_type", default="bert", type=str, help="The model architecture to be fine-tuned.") + parser.add_argument( + "--model_name_or_path", + default="bert-base-cased", + type=str, + help="The model checkpoint for weights initialization.", + ) + + parser.add_argument( + "--mlm", action="store_true", help="Train with masked-language modeling loss instead of language modeling." + ) + parser.add_argument( + "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss" + ) + + parser.add_argument( + "--config_name", + default="", + type=str, + help="Optional pretrained config name or path if not the same as model_name_or_path", + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Optional pretrained tokenizer name or path if not the same as model_name_or_path", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)", + ) + parser.add_argument( + "--block_size", + default=-1, + type=int, + help="Optional input sequence length after tokenization." + "The training dataset will be truncated in block of this size for training." 
+ "Default to the model max input length for single sentence inputs (take into account special tokens).", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) + + parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--save_total_limit", + type=int, + default=None, + help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default", + ) + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") + + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
+ "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm: - raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm " - "flag (masked language modeling).") + raise ValueError( + "BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm " + "flag (masked language modeling)." + ) if args.eval_data_file is None and args.do_eval: - raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " - "or remove the --do_eval argument.") - - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + raise ValueError( + "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " + "or remove the --do_eval argument." + ) + + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -495,16 +595,24 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -514,18 +622,26 @@ def main(): torch.distributed.barrier() # Barrier to make sure only the first process in distributed training download model & vocab config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - 
cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.block_size <= 0: - args.block_size = tokenizer.max_len_single_sentence # Our input block size will be the max possible for the model + args.block_size = ( + tokenizer.max_len_single_sentence + ) # Our input block size will be the max possible for the model args.block_size = min(args.block_size, tokenizer.max_len_single_sentence) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) model.to(args.device) if args.local_rank == 0: @@ -546,7 +662,6 @@ def main(): global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed @@ -556,35 +671,38 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.to(args.device) - # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) - result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) + result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) return results diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index 9d1ca7f30..bfa62cfb7 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -26,8 +26,7 @@ import random import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler try: @@ -37,34 +36,38 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, BertConfig, - BertForMultipleChoice, BertTokenizer, - XLNetConfig, XLNetForMultipleChoice, - XLNetTokenizer, RobertaConfig, - RobertaForMultipleChoice, RobertaTokenizer) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForMultipleChoice, + BertTokenizer, + XLNetConfig, + XLNetForMultipleChoice, + XLNetTokenizer, + RobertaConfig, + RobertaForMultipleChoice, + RobertaTokenizer, +) from transformers import AdamW, get_linear_schedule_with_warmup -from utils_multiple_choice import (convert_examples_to_features, processors) +from utils_multiple_choice import convert_examples_to_features, processors logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), ()) +ALL_MODELS = sum( + (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, 
RobertaConfig)), () +) MODEL_CLASSES = { - 'bert': (BertConfig, BertForMultipleChoice, BertTokenizer), - 'xlnet': (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer), - 'roberta': (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer) + "bert": (BertConfig, BertForMultipleChoice, BertTokenizer), + "xlnet": (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer), + "roberta": (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer), } + def select_field(features, field): - return [ - [ - choice[field] - for choice in feature.choices_features - ] - for feature in features - ] + return [[choice[field] for choice in feature.choices_features] for feature in features] def simple_accuracy(preds, labels): @@ -95,13 +98,18 @@ def train(args, train_dataset, model, tokenizer): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) if args.fp16: try: from apex import amp @@ -115,17 +123,21 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. 
parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -141,15 +153,19 @@ def train(args, train_dataset, model, tokenizer): for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids - 'labels': batch[3]} + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "token_type_ids": batch[2] + if args.model_type in ["bert", "xlnet"] + else None, # XLM don't use segment_ids + "labels": batch[3], + } outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training + loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -171,10 +187,12 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) + tb_writer.add_scalar("eval_{}".format(key), value, global_step) if results["eval_acc"] > best_dev_acc: best_dev_acc = results["eval_acc"] best_dev_loss = results["eval_loss"] @@ -182,22 +200,33 @@ def train(args, train_dataset, model, tokenizer): if args.do_test: results_test = evaluate(args, model, tokenizer, test=True) for key, value in results_test.items(): - tb_writer.add_scalar('test_{}'.format(key), value, global_step) - logger.info("test acc: %s, loss: %s, global steps: %s", str(results_test['eval_acc']), str(results_test['eval_loss']), str(global_step)) - tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) - logger.info("Average loss: %s at global step: %s", str((tr_loss - logging_loss)/args.logging_steps), str(global_step)) + tb_writer.add_scalar("test_{}".format(key), value, global_step) + logger.info( + "test acc: %s, loss: %s, global steps: %s", + str(results_test["eval_acc"]), + str(results_test["eval_loss"]), + str(global_step), + ) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) + logger.info( + "Average loss: %s at global step: %s", + str((tr_loss - logging_loss) / args.logging_steps), + str(global_step), + ) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): 
os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_vocabulary(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -246,10 +275,14 @@ def evaluate(args, model, tokenizer, prefix="", test=False): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids - 'labels': batch[3]} + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "token_type_ids": batch[2] + if args.model_type in ["bert", "xlnet"] + else None, # XLM don't use segment_ids + "labels": batch[3], + } outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -257,10 +290,10 @@ def evaluate(args, model, tokenizer, prefix="", test=False): nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() - out_label_ids = inputs['labels'].detach().cpu().numpy() + out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) + out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = np.argmax(preds, axis=1) @@ -273,8 +306,14 @@ def evaluate(args, model, tokenizer, prefix="", test=False): with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(str(prefix) + " is test:" + str(test))) writer.write("model =%s\n" % str(args.model_name_or_path)) - writer.write("total batch size=%d\n" % (args.per_gpu_train_batch_size * args.gradient_accumulation_steps * - (torch.distributed.get_world_size() if args.local_rank != -1 else 1))) + writer.write( + "total batch size=%d\n" + % ( + args.per_gpu_train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1) + ) + ) writer.write("train num epochs=%d\n" % args.num_train_epochs) writer.write("fp16 =%s\n" % args.fp16) writer.write("max seq length =%d\n" % args.max_seq_length) @@ -291,17 +330,21 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): processor = processors[task]() # Load data features from cache or dataset file if evaluate: - cached_mode = 'dev' + cached_mode = "dev" elif test: - cached_mode = 'test' + cached_mode = "test" else: - cached_mode = 'train' + cached_mode = "train" assert (evaluate == True and test == True) == False - cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( - cached_mode, - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length), - str(task))) + cached_features_file = os.path.join( + args.data_dir, + "cached_{}_{}_{}_{}".format( + cached_mode, + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + str(task), + ), + ) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file 
%s", cached_features_file) features = torch.load(cached_features_file) @@ -320,8 +363,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): label_list, args.max_seq_length, tokenizer, - pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet - pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0 + pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet + pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) @@ -331,9 +374,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache # Convert to Tensors and build dataset - all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long) - all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long) - all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long) + all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long) + all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long) + all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long) all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long) dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) @@ -344,91 +387,150 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. Should contain the .tsv files (or other data files) for the task.") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--task_name", default=None, type=str, required=True, - help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. 
Should contain the .tsv files (or other data files) for the task.", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train selected in the list: " + ", ".join(processors.keys()), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set') - parser.add_argument("--evaluate_during_training", action='store_true', - help="Run evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") - - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Avoid using CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") - - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--local_rank", type=int, default=-1, - help="For distributed training: local_rank") - parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument("--do_test", action="store_true", help="Whether to run test on the test set") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) + + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." 
+ ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") + + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -440,16 +542,24 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -468,17 +578,23 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels, - finetuning_task=args.task_name, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + num_labels=num_labels, + finetuning_task=args.task_name, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -494,7 +610,6 @@ def main(): global_step, tr_loss, best_steps = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if 
needed @@ -504,19 +619,20 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir) model.to(args.device) - # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: @@ -524,17 +640,19 @@ def main(): args.output_dir = args.model_name_or_path checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) - result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) + result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) if args.do_test and args.local_rank in [-1, 0]: @@ -546,13 +664,13 @@ def main(): # logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix, test=True) - result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) + result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) if best_steps: logger.info("best steps of eval acc is the following checkpoints: %s", best_steps) diff --git a/examples/run_ner.py b/examples/run_ner.py index 0fdaacf2a..48ac61b4f 100644 --- a/examples/run_ner.py +++ b/examples/run_ner.py @@ -43,9 +43,12 @@ from transformers import XLMRobertaConfig, XLMRobertaForTokenClassification, XLM logger = logging.getLogger(__name__) 
ALL_MODELS = sum( - (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig, - CamembertConfig, XLMRobertaConfig)), - ()) + ( + tuple(conf.pretrained_config_archive_map.keys()) + for conf in (BertConfig, RobertaConfig, DistilBertConfig, CamembertConfig, XLMRobertaConfig) + ), + (), +) MODEL_CLASSES = { "bert": (BertConfig, BertForTokenClassification, BertTokenizer), @@ -82,18 +85,24 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": args.weight_decay}, - {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0} + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) # Check if saved optimizer or scheduler states exist - if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')): + if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( + os.path.join(args.model_name_or_path, "scheduler.pt") + ): # Load in optimizer and scheduler states - optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt'))) - scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt'))) + optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) + scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: @@ -108,18 +117,21 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * ( - torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. 
parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -129,7 +141,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path - global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0]) + global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) @@ -140,7 +152,9 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() - train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) + train_iterator = trange( + epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] + ) set_seed(args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) @@ -153,11 +167,11 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {"input_ids": batch[0], - "attention_mask": batch[1], - "labels": batch[3]} + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} if args.model_type != "distilbert": - inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids + inputs["token_type_ids"] = ( + batch[2] if args.model_type in ["bert", "xlnet"] else None + ) # XLM and RoBERTa don"t use segment_ids outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) @@ -187,7 +201,9 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev") for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) @@ -200,15 +216,17 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 
"training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) - torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt')) - torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt')) + torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) + torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -249,11 +267,11 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix="" batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {"input_ids": batch[0], - "attention_mask": batch[1], - "labels": batch[3]} + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} if args.model_type != "distilbert": - inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids + inputs["token_type_ids"] = ( + batch[2] if args.model_type in ["bert", "xlnet"] else None + ) # XLM and RoBERTa don"t use segment_ids outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -287,7 +305,7 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix="" "loss": eval_loss, "precision": precision_score(out_label_list, preds_list), "recall": recall_score(out_label_list, preds_list), - "f1": f1_score(out_label_list, preds_list) + "f1": f1_score(out_label_list, preds_list), } logger.info("***** Eval results %s *****", prefix) @@ -302,29 +320,36 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode): torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache # Load data features from cache or dataset file - cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format(mode, - list(filter(None, args.model_name_or_path.split("/"))).pop(), - str(args.max_seq_length))) + cached_features_file = os.path.join( + args.data_dir, + "cached_{}_{}_{}".format( + mode, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length) + ), + ) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", args.data_dir) examples = read_examples_from_file(args.data_dir, mode) - features = convert_examples_to_features(examples, labels, args.max_seq_length, tokenizer, - cls_token_at_end=bool(args.model_type in ["xlnet"]), - # xlnet has a cls token at the end - cls_token=tokenizer.cls_token, - cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0, - sep_token=tokenizer.sep_token, - sep_token_extra=bool(args.model_type in ["roberta"]), - # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 - pad_on_left=bool(args.model_type in ["xlnet"]), - # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, - pad_token_label_id=pad_token_label_id - ) + features = convert_examples_to_features( + examples, + labels, + args.max_seq_length, + tokenizer, + cls_token_at_end=bool(args.model_type in ["xlnet"]), + # xlnet has a cls token at the end + cls_token=tokenizer.cls_token, + cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0, + sep_token=tokenizer.sep_token, + sep_token_extra=bool(args.model_type in ["roberta"]), + # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 + pad_on_left=bool(args.model_type in ["xlnet"]), + # pad on the left for xlnet + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, + pad_token_label_id=pad_token_label_id, + ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) @@ -346,95 +371,151 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--labels", default="", type=str, - help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.") - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after tokenization. 
Sequences longer " - "than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--do_train", action="store_true", - help="Whether to run training.") - parser.add_argument("--do_eval", action="store_true", - help="Whether to run eval on the dev set.") - parser.add_argument("--do_predict", action="store_true", - help="Whether to run predictions on the test set.") - parser.add_argument("--evaluate_during_training", action="store_true", - help="Whether to run evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action="store_true", - help="Set this flag if you are using an uncased model.") - - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument("--gradient_accumulation_steps", type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight decay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - - parser.add_argument("--logging_steps", type=int, default=50, - help="Log every X updates steps.") - parser.add_argument("--save_steps", type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action="store_true", - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action="store_true", - help="Avoid using CUDA when available") - parser.add_argument("--overwrite_output_dir", action="store_true", - help="Overwrite the content of the output directory") - parser.add_argument("--overwrite_cache", action="store_true", - help="Overwrite the cached training and evaluation sets") - parser.add_argument("--seed", type=int, default=42, - help="random seed for initialization") - - parser.add_argument("--fp16", action="store_true", - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument("--fp16_opt_level", type=str, default="O1", - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--local_rank", type=int, default=-1, - help="For distributed training: local_rank") + parser.add_argument( + "--labels", + default="", + type=str, + help="Path to a file containing all labels. 
If not specified, CoNLL-2003 labels are used.", + ) + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") + parser.add_argument( + "--evaluate_during_training", + action="store_true", + help="Whether to run evaluation during training at each logging step.", + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) + + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") + + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir( - args.output_dir) and args.do_train and not args.overwrite_output_dir: + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): raise ValueError( "Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format( - args.output_dir)) + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -451,11 +532,19 @@ def main(): args.device = device # Setup logging - logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -472,16 +561,22 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + num_labels=num_labels, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -505,7 +600,9 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) @@ -518,7 +615,9 @@ def main(): tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: @@ -565,4 +664,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/examples/run_squad.py b/examples/run_squad.py index 18a5a1c23..1580a31e8 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -17,7 +17,11 @@ from __future__ import absolute_import, division, print_function from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult -from transformers.data.metrics.squad_metrics import compute_predictions_logits, compute_predictions_log_probs, squad_evaluate +from transformers.data.metrics.squad_metrics import ( + compute_predictions_logits, + compute_predictions_log_probs, + squad_evaluate, +) import argparse import logging @@ -27,8 +31,7 @@ import glob import timeit import numpy as np import torch -from torch.utils.data import ( - DataLoader, RandomSampler, SequentialSampler, TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler try: @@ -38,32 +41,47 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, BertConfig, - BertForQuestionAnswering, BertTokenizer, - RobertaForQuestionAnswering, RobertaTokenizer, RobertaConfig, - XLMConfig, XLMForQuestionAnswering, - XLMTokenizer, XLNetConfig, - XLNetForQuestionAnswering, - XLNetTokenizer, - DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer, - AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer, - XLMConfig, XLMForQuestionAnswering, XLMTokenizer, - ) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForQuestionAnswering, + BertTokenizer, + RobertaForQuestionAnswering, + RobertaTokenizer, + RobertaConfig, + XLMConfig, + XLMForQuestionAnswering, + XLMTokenizer, + XLNetConfig, + XLNetForQuestionAnswering, + XLNetTokenizer, + DistilBertConfig, + DistilBertForQuestionAnswering, + DistilBertTokenizer, + AlbertConfig, + AlbertForQuestionAnswering, + AlbertTokenizer, + XLMConfig, + XLMForQuestionAnswering, + XLMTokenizer, +) from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \ - for conf in (BertConfig, RobertaConfig, XLNetConfig, XLMConfig)), ()) +ALL_MODELS = sum( + (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, XLNetConfig, XLMConfig)), + (), +) MODEL_CLASSES = { - 'bert': (BertConfig, BertForQuestionAnswering, 
BertTokenizer), - 'roberta': (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer), - 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), - 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), - 'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer), + "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer), + "roberta": (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer), + "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), + "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), + "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), + "albert": (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer), } @@ -85,49 +103,44 @@ def train(args, train_dataset, model, tokenizer): tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) - train_sampler = RandomSampler( - train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) - train_dataloader = DataLoader( - train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) + train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps - args.num_train_epochs = args.max_steps // ( - len(train_dataloader) // args.gradient_accumulation_steps) + 1 + args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: - t_total = len( - train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs + t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any( - nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any( - nd in n for nd in no_decay)], 'weight_decay': 0.0} + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] - optimizer = AdamW(optimizer_grouped_parameters, - lr=args.learning_rate, eps=args.adam_epsilon) + optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) # Check if saved optimizer or scheduler states exist - if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')): + if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( + os.path.join(args.model_name_or_path, "scheduler.pt") + ): # Load in optimizer and scheduler states - optimizer.load_state_dict(torch.load( - os.path.join(args.model_name_or_path, 'optimizer.pt'))) - scheduler.load_state_dict(torch.load( - os.path.join(args.model_name_or_path, 
'scheduler.pt'))) + optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) + scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") - model, optimizer = amp.initialize( - model, optimizer, opt_level=args.fp16_opt_level) + model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: @@ -135,20 +148,22 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) - logger.info(" Instantaneous batch size per GPU = %d", - args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) - logger.info(" Gradient Accumulation steps = %d", - args.gradient_accumulation_steps) + logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) + logger.info( + " Total train batch size (w. 
parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) + logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 1 @@ -157,29 +172,25 @@ def train(args, train_dataset, model, tokenizer): # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path - global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0]) - epochs_trained = global_step // (len(train_dataloader) // - args.gradient_accumulation_steps) - steps_trained_in_current_epoch = global_step % ( - len(train_dataloader) // args.gradient_accumulation_steps) - - logger.info( - " Continuing training from checkpoint, will skip to saved global_step") + global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) + epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) + steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) + + logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) - logger.info(" Will skip the first %d steps in the first epoch", - steps_trained_in_current_epoch) + logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() - train_iterator = trange(epochs_trained, int( - args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) + train_iterator = trange( + epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] + ) # Added here for reproductibility (even between python 2 and 3) set_seed(args) for _ in train_iterator: - epoch_iterator = tqdm(train_dataloader, desc="Iteration", - disable=args.local_rank not in [-1, 0]) + epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training @@ -191,18 +202,17 @@ def train(args, train_dataset, model, tokenizer): batch = tuple(t.to(args.device) for t in batch) inputs = { - 'input_ids': batch[0], - 'attention_mask': batch[1], - 'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2], - 'start_positions': batch[3], - 'end_positions': batch[4], + "input_ids": batch[0], + "attention_mask": batch[1], + "token_type_ids": None if args.model_type in ["xlm", "roberta", "distilbert"] else batch[2], + "start_positions": batch[3], + "end_positions": batch[4], } - if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[5], - 'p_mask': batch[6]}) + if args.model_type in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) if args.version_2_with_negative: - inputs.update({'is_impossible': batch[7]}) + inputs.update({"is_impossible": batch[7]}) outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] @@ -221,11 +231,9 @@ def train(args, train_dataset, model, tokenizer): tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: - torch.nn.utils.clip_grad_norm_( - 
amp.master_params(optimizer), args.max_grad_norm) + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: - torch.nn.utils.clip_grad_norm_( - model.parameters(), args.max_grad_norm) + torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule @@ -238,36 +246,27 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank == -1 and args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar( - 'eval_{}'.format(key), value, global_step) - tb_writer.add_scalar( - 'lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar( - 'loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + tb_writer.add_scalar("eval_{}".format(key), value, global_step) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss # Save model checkpoint if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: - output_dir = os.path.join( - args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training - model_to_save = model.module if hasattr( - model, 'module') else model + model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - torch.save(args, os.path.join( - output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) - torch.save(optimizer.state_dict(), os.path.join( - output_dir, 'optimizer.pt')) - torch.save(scheduler.state_dict(), os.path.join( - output_dir, 'scheduler.pt')) - logger.info( - "Saving optimizer and scheduler states to %s", output_dir) + torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) + torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) + logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() @@ -283,8 +282,7 @@ def train(args, train_dataset, model, tokenizer): def evaluate(args, model, tokenizer, prefix=""): - dataset, examples, features = load_and_cache_examples( - args, tokenizer, evaluate=True, output_examples=True) + dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) @@ -293,8 +291,7 @@ def evaluate(args, model, tokenizer, prefix=""): # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) - eval_dataloader = DataLoader( - dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) + eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): @@ -314,15 +311,15 @@ def evaluate(args, model, tokenizer, prefix=""): with torch.no_grad(): inputs = { - 'input_ids': batch[0], - 'attention_mask': batch[1], - 'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2], + "input_ids": batch[0], 
+ "attention_mask": batch[1], + "token_type_ids": None if args.model_type in ["xlm", "roberta", "distilbert"] else batch[2], } example_indices = batch[3] # XLNet and XLM use more arguments for their predictions - if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[4], 'p_mask': batch[5]}) + if args.model_type in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) outputs = model(**inputs) @@ -342,53 +339,68 @@ def evaluate(args, model, tokenizer, prefix=""): cls_logits = output[4] result = SquadResult( - unique_id, start_logits, end_logits, + unique_id, + start_logits, + end_logits, start_top_index=start_top_index, end_top_index=end_top_index, - cls_logits=cls_logits + cls_logits=cls_logits, ) else: start_logits, end_logits = output - result = SquadResult( - unique_id, start_logits, end_logits - ) + result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) evalTime = timeit.default_timer() - start_time - logger.info(" Evaluation done in total %f secs (%f sec per example)", - evalTime, evalTime / len(dataset)) + logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset)) # Compute predictions - output_prediction_file = os.path.join( - args.output_dir, "predictions_{}.json".format(prefix)) - output_nbest_file = os.path.join( - args.output_dir, "nbest_predictions_{}.json".format(prefix)) + output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) + output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix)) if args.version_2_with_negative: - output_null_log_odds_file = os.path.join( - args.output_dir, "null_odds_{}.json".format(prefix)) + output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None # XLNet and XLM use a more complex post-processing procedure - if args.model_type in ['xlnet', 'xlm']: - start_n_top = model.config.start_n_top if hasattr( - model, "config") else model.module.config.start_n_top - end_n_top = model.config.end_n_top if hasattr( - model, "config") else model.module.config.end_n_top - - predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size, - args.max_answer_length, output_prediction_file, - output_nbest_file, output_null_log_odds_file, - start_n_top, end_n_top, - args.version_2_with_negative, tokenizer, args.verbose_logging) + if args.model_type in ["xlnet", "xlm"]: + start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top + end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top + + predictions = compute_predictions_log_probs( + examples, + features, + all_results, + args.n_best_size, + args.max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + start_n_top, + end_n_top, + args.version_2_with_negative, + tokenizer, + args.verbose_logging, + ) else: - predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size, - args.max_answer_length, args.do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, args.verbose_logging, - args.version_2_with_negative, args.null_score_diff_threshold, tokenizer) + predictions = compute_predictions_logits( + examples, + features, + all_results, + args.n_best_size, + args.max_answer_length, + args.do_lower_case, + output_prediction_file, + 
output_nbest_file, + output_null_log_odds_file, + args.verbose_logging, + args.version_2_with_negative, + args.null_score_diff_threshold, + tokenizer, + ) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) @@ -402,16 +414,18 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal # Load data features from cache or dataset file input_dir = args.data_dir if args.data_dir else "." - cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length)) + cached_features_file = os.path.join( + input_dir, + "cached_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + ), ) # Init features and dataset from cache if it exists if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples: - logger.info("Loading features from cached file %s", - cached_features_file) + logger.info("Loading features from cached file %s", cached_features_file) features_and_dataset = torch.load(cached_features_file) features, dataset = features_and_dataset["features"], features_and_dataset["dataset"] else: @@ -421,16 +435,13 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal try: import tensorflow_datasets as tfds except ImportError: - raise ImportError( - "If not data_dir is specified, tensorflow_datasets needs to be installed.") + raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.") if args.version_2_with_negative: - logger.warn( - "tensorflow_datasets does not handle version 2 of SQuAD.") + logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.") tfds_examples = tfds.load("squad") - examples = SquadV1Processor().get_examples_from_dataset( - tfds_examples, evaluate=evaluate) + examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate) else: processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor() if evaluate: @@ -445,15 +456,13 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=not evaluate, - return_dataset='pt', + return_dataset="pt", threads=args.threads, ) if args.local_rank in [-1, 0]: - logger.info("Saving features into cached file %s", - cached_features_file) - torch.save({"features": features, "dataset": dataset}, - cached_features_file) + logger.info("Saving features into cached file %s", cached_features_file) + torch.save({"features": features, "dataset": dataset}, cached_features_file) if args.local_rank == 0 and not evaluate: # Make sure only the first process in distributed training process the dataset, and the others will use the cache @@ -468,140 +477,232 @@ def main(): parser = argparse.ArgumentParser() # Required parameters - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model checkpoints and predictions will be written.") + 
parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints and predictions will be written.", + ) # Other parameters - parser.add_argument("--data_dir", default=None, type=str, - help="The input data dir. Should contain the .json files for the task." + - "If no data dir or train/predict files are specified, will run with tensorflow_datasets.") - parser.add_argument("--train_file", default=None, type=str, - help="The input training file. If a data dir is specified, will look for the file there" + - "If no data dir or train/predict files are specified, will run with tensorflow_datasets.") - parser.add_argument("--predict_file", default=None, type=str, - help="The input evaluation file. If a data dir is specified, will look for the file there" + - "If no data dir or train/predict files are specified, will run with tensorflow_datasets.") - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - - parser.add_argument('--version_2_with_negative', action='store_true', - help='If true, the SQuAD examples contain some that do not have an answer.') - parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, - help="If null_score - best_non_null is greater than the threshold predict null.") - - parser.add_argument("--max_seq_length", default=384, type=int, - help="The maximum total input sequence length after WordPiece tokenization. Sequences " - "longer than this will be truncated, and sequences shorter than this will be padded.") - parser.add_argument("--doc_stride", default=128, type=int, - help="When splitting up a long document into chunks, how much stride to take between chunks.") - parser.add_argument("--max_query_length", default=64, type=int, - help="The maximum number of tokens for the question. 
Questions longer than this will " - "be truncated to this length.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") - - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight decay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - parser.add_argument("--n_best_size", default=20, type=int, - help="The total number of n-best predictions to generate in the nbest_predictions.json output file.") - parser.add_argument("--max_answer_length", default=30, type=int, - help="The maximum length of an answer that can be generated. This is needed because the start " - "and end predictions are not conditioned on one another.") - parser.add_argument("--verbose_logging", action='store_true', - help="If true, all of the warnings related to data processing will be printed. 
" - "A number of warnings are expected for a normal SQuAD evaluation.") - - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Whether not to use CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") - - parser.add_argument("--local_rank", type=int, default=-1, - help="local_rank for distributed training on gpus") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") - - parser.add_argument('--threads', type=int, default=1, help='multiple threads for converting example to features') + parser.add_argument( + "--data_dir", + default=None, + type=str, + help="The input data dir. Should contain the .json files for the task." + + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", + ) + parser.add_argument( + "--train_file", + default=None, + type=str, + help="The input training file. If a data dir is specified, will look for the file there" + + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", + ) + parser.add_argument( + "--predict_file", + default=None, + type=str, + help="The input evaluation file. If a data dir is specified, will look for the file there" + + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", + ) + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + + parser.add_argument( + "--version_2_with_negative", + action="store_true", + help="If true, the SQuAD examples contain some that do not have an answer.", + ) + parser.add_argument( + "--null_score_diff_threshold", + type=float, + default=0.0, + help="If null_score - best_non_null is greater than the threshold predict null.", + ) + + parser.add_argument( + "--max_seq_length", + default=384, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. 
Sequences " + "longer than this will be truncated, and sequences shorter than this will be padded.", + ) + parser.add_argument( + "--doc_stride", + default=128, + type=int, + help="When splitting up a long document into chunks, how much stride to take between chunks.", + ) + parser.add_argument( + "--max_query_length", + default=64, + type=int, + help="The maximum number of tokens for the question. Questions longer than this will " + "be truncated to this length.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) + + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + parser.add_argument( + "--n_best_size", + default=20, + type=int, + help="The total number of n-best predictions to generate in the nbest_predictions.json output file.", + ) + parser.add_argument( + "--max_answer_length", + default=30, + type=int, + help="The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another.", + ) + parser.add_argument( + "--verbose_logging", + action="store_true", + help="If true, all of the warnings related to data processing will be printed. 
" + "A number of warnings are expected for a normal SQuAD evaluation.", + ) + + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") + + parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") + + parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): raise ValueError( - "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + "Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") - ptvsd.enable_attach( - address=(args.server_ip, args.server_port), redirect_output=True) + ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: - device = torch.device( - "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -613,16 +714,21 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool( - '.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: # Make sure only the first process in distributed training will download model & vocab @@ -638,18 +744,16 @@ def main(): if args.fp16: try: import apex - apex.amp.register_half_function(torch, 'einsum') + + apex.amp.register_half_function(torch, "einsum") except 
ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") # Training if args.do_train: - train_dataset = load_and_cache_examples( - args, tokenizer, evaluate=False, output_examples=False) + train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False) global_step, tr_loss = train(args, train_dataset, model, tokenizer) - logger.info(" global_step = %s, average loss = %s", - global_step, tr_loss) + logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Save the trained model and the tokenizer if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): @@ -661,18 +765,16 @@ def main(): # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` # Take care of distributed/parallel training - model_to_save = model.module if hasattr(model, 'module') else model + model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned - model = model_class.from_pretrained( - args.output_dir, force_download=True) - tokenizer = tokenizer_class.from_pretrained( - args.output_dir, do_lower_case=args.do_lower_case) + model = model_class.from_pretrained(args.output_dir, force_download=True) + tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.to(args.device) # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory @@ -682,7 +784,10 @@ def main(): logger.info("Loading checkpoints saved during training for evaluation") checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) + for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs else: logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path) @@ -692,17 +797,14 @@ def main(): for checkpoint in checkpoints: # Reload the model - global_step = checkpoint.split( - '-')[-1] if len(checkpoints) > 1 else "" - model = model_class.from_pretrained( - checkpoint, force_download=True) + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + model = model_class.from_pretrained(checkpoint, force_download=True) model.to(args.device) # Evaluate result = evaluate(args, model, tokenizer, prefix=global_step) - result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) - for k, v in result.items()) + result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items()) results.update(result) logger.info("Results: {}".format(results)) diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py index 54282277d..74a6db34a 100644 --- a/examples/run_tf_glue.py +++ b/examples/run_tf_glue.py @@ -1,7 +1,14 @@ import os import tensorflow 
as tf import tensorflow_datasets -from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig, glue_convert_examples_to_features, BertForSequenceClassification, glue_processors +from transformers import ( + BertTokenizer, + TFBertForSequenceClassification, + BertConfig, + glue_convert_examples_to_features, + BertForSequenceClassification, + glue_processors, +) # script parameters BATCH_SIZE = 32 @@ -16,7 +23,7 @@ if TASK == "sst-2": TFDS_TASK = "sst2" elif TASK == "sts-b": TFDS_TASK = "stsb" -else: +else: TFDS_TASK = TASK num_labels = len(glue_processors[TASK]().get_labels()) @@ -27,29 +34,29 @@ tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP}) # Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression) config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels) -tokenizer = BertTokenizer.from_pretrained('bert-base-cased') -model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config) +tokenizer = BertTokenizer.from_pretrained("bert-base-cased") +model = TFBertForSequenceClassification.from_pretrained("bert-base-cased", config=config) # Load dataset via TensorFlow Datasets -data, info = tensorflow_datasets.load(f'glue/{TFDS_TASK}', with_info=True) -train_examples = info.splits['train'].num_examples +data, info = tensorflow_datasets.load(f"glue/{TFDS_TASK}", with_info=True) +train_examples = info.splits["train"].num_examples # MNLI expects either validation_matched or validation_mismatched -valid_examples = info.splits['validation'].num_examples +valid_examples = info.splits["validation"].num_examples # Prepare dataset for GLUE as a tf.data.Dataset instance -train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, TASK) +train_dataset = glue_convert_examples_to_features(data["train"], tokenizer, 128, TASK) # MNLI expects either validation_matched or validation_mismatched -valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, TASK) +valid_dataset = glue_convert_examples_to_features(data["validation"], tokenizer, 128, TASK) train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1) valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE) -# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule +# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08) if USE_AMP: # loss scaling is currently required when using mixed precision - opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic') + opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic") if num_labels == 1: @@ -57,37 +64,42 @@ if num_labels == 1: else: loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) -metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') +metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") model.compile(optimizer=opt, loss=loss, metrics=[metric]) # Train and evaluate using tf.keras.Model.fit() -train_steps = train_examples//BATCH_SIZE -valid_steps = valid_examples//EVAL_BATCH_SIZE +train_steps = train_examples // BATCH_SIZE +valid_steps = valid_examples // EVAL_BATCH_SIZE -history = model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=train_steps, - validation_data=valid_dataset, validation_steps=valid_steps) +history = model.fit( + train_dataset, + 
epochs=EPOCHS, + steps_per_epoch=train_steps, + validation_data=valid_dataset, + validation_steps=valid_steps, +) # Save TF2 model -os.makedirs('./save/', exist_ok=True) -model.save_pretrained('./save/') +os.makedirs("./save/", exist_ok=True) +model.save_pretrained("./save/") if TASK == "mrpc": # Load the TensorFlow model in PyTorch for inspection - # This is to demo the interoperability between the two frameworks, you don't have to + # This is to demo the interoperability between the two frameworks, you don't have to # do this in real life (you can run the inference on the TF model). - pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True) + pytorch_model = BertForSequenceClassification.from_pretrained("./save/", from_tf=True) # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task - sentence_0 = 'This research was consistent with his findings.' - sentence_1 = 'His findings were compatible with this research.' - sentence_2 = 'His findings were not compatible with this research.' - inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt') - inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt') + sentence_0 = "This research was consistent with his findings." + sentence_1 = "His findings were compatible with this research." + sentence_2 = "His findings were not compatible with this research." + inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors="pt") + inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors="pt") del inputs_1["special_tokens_mask"] del inputs_2["special_tokens_mask"] pred_1 = pytorch_model(**inputs_1)[0].argmax().item() pred_2 = pytorch_model(**inputs_2)[0].argmax().item() - print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0') - print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0') + print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0") + print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0") diff --git a/examples/run_tf_ner.py b/examples/run_tf_ner.py index eb284f4c2..77850d1ab 100644 --- a/examples/run_tf_ner.py +++ b/examples/run_tf_ner.py @@ -21,189 +21,156 @@ from absl import app ALL_MODELS = sum( - (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)), - ()) + (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)), () +) MODEL_CLASSES = { "bert": (BertConfig, TFBertForTokenClassification, BertTokenizer), "roberta": (RobertaConfig, TFRobertaForTokenClassification, RobertaTokenizer), - "distilbert": (DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer) + "distilbert": (DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer), } flags.DEFINE_string( - "data_dir", None, - "The input data dir. Should contain the .conll files (or other data files) " - "for the task.") + "data_dir", None, "The input data dir. Should contain the .conll files (or other data files) " "for the task." 
+) -flags.DEFINE_string( - "model_type", None, - "Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) +flags.DEFINE_string("model_type", None, "Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) flags.DEFINE_string( - "model_name_or_path", None, - "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) + "model_name_or_path", + None, + "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), +) -flags.DEFINE_string( - "output_dir", None, - "The output directory where the model checkpoints will be written.") +flags.DEFINE_string("output_dir", None, "The output directory where the model checkpoints will be written.") flags.DEFINE_string( - "labels", "", - "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.") + "labels", "", "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used." +) -flags.DEFINE_string( - "config_name", "", - "Pretrained config name or path if not the same as model_name") +flags.DEFINE_string("config_name", "", "Pretrained config name or path if not the same as model_name") -flags.DEFINE_string( - "tokenizer_name", "", - "Pretrained tokenizer name or path if not the same as model_name") +flags.DEFINE_string("tokenizer_name", "", "Pretrained tokenizer name or path if not the same as model_name") -flags.DEFINE_string( - "cache_dir", "", - "Where do you want to store the pre-trained models downloaded from s3") +flags.DEFINE_string("cache_dir", "", "Where do you want to store the pre-trained models downloaded from s3") flags.DEFINE_integer( - "max_seq_length", 128, + "max_seq_length", + 128, "The maximum total input sentence length after tokenization. " "Sequences longer than this will be truncated, sequences shorter " - "will be padded.") + "will be padded.", +) flags.DEFINE_string( - "tpu", None, + "tpu", + None, "The Cloud TPU to use for training. This should be either the name " "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " - "url.") + "url.", +) -flags.DEFINE_integer( - "num_tpu_cores", 8, - "Total number of TPU cores to use.") +flags.DEFINE_integer("num_tpu_cores", 8, "Total number of TPU cores to use.") -flags.DEFINE_boolean( - "do_train", False, - "Whether to run training.") +flags.DEFINE_boolean("do_train", False, "Whether to run training.") -flags.DEFINE_boolean( - "do_eval", False, - "Whether to run eval on the dev set.") +flags.DEFINE_boolean("do_eval", False, "Whether to run eval on the dev set.") -flags.DEFINE_boolean( - "do_predict", False, - "Whether to run predictions on the test set.") +flags.DEFINE_boolean("do_predict", False, "Whether to run predictions on the test set.") flags.DEFINE_boolean( - "evaluate_during_training", False, - "Whether to run evaluation during training at each logging step.") + "evaluate_during_training", False, "Whether to run evaluation during training at each logging step." 
+) -flags.DEFINE_boolean( - "do_lower_case", False, - "Set this flag if you are using an uncased model.") +flags.DEFINE_boolean("do_lower_case", False, "Set this flag if you are using an uncased model.") -flags.DEFINE_integer( - "per_device_train_batch_size", 8, - "Batch size per GPU/CPU/TPU for training.") +flags.DEFINE_integer("per_device_train_batch_size", 8, "Batch size per GPU/CPU/TPU for training.") -flags.DEFINE_integer( - "per_device_eval_batch_size", 8, - "Batch size per GPU/CPU/TPU for evaluation.") +flags.DEFINE_integer("per_device_eval_batch_size", 8, "Batch size per GPU/CPU/TPU for evaluation.") flags.DEFINE_integer( - "gradient_accumulation_steps", 1, - "Number of updates steps to accumulate before performing a backward/update pass.") + "gradient_accumulation_steps", 1, "Number of updates steps to accumulate before performing a backward/update pass." +) -flags.DEFINE_float( - "learning_rate", 5e-5, - "The initial learning rate for Adam.") +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") -flags.DEFINE_float( - "weight_decay", 0.0, - "Weight decay if we apply some.") +flags.DEFINE_float("weight_decay", 0.0, "Weight decay if we apply some.") -flags.DEFINE_float( - "adam_epsilon", 1e-8, - "Epsilon for Adam optimizer.") +flags.DEFINE_float("adam_epsilon", 1e-8, "Epsilon for Adam optimizer.") -flags.DEFINE_float( - "max_grad_norm", 1.0, - "Max gradient norm.") +flags.DEFINE_float("max_grad_norm", 1.0, "Max gradient norm.") -flags.DEFINE_integer( - "num_train_epochs", 3, - "Total number of training epochs to perform.") +flags.DEFINE_integer("num_train_epochs", 3, "Total number of training epochs to perform.") flags.DEFINE_integer( - "max_steps", -1, - "If > 0: set total number of training steps to perform. Override num_train_epochs.") + "max_steps", -1, "If > 0: set total number of training steps to perform. Override num_train_epochs." 
+) -flags.DEFINE_integer( - "warmup_steps", 0, - "Linear warmup over warmup_steps.") +flags.DEFINE_integer("warmup_steps", 0, "Linear warmup over warmup_steps.") -flags.DEFINE_integer( - "logging_steps", 50, - "Log every X updates steps.") +flags.DEFINE_integer("logging_steps", 50, "Log every X updates steps.") -flags.DEFINE_integer( - "save_steps", 50, - "Save checkpoint every X updates steps.") +flags.DEFINE_integer("save_steps", 50, "Save checkpoint every X updates steps.") flags.DEFINE_boolean( - "eval_all_checkpoints", False, - "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") + "eval_all_checkpoints", + False, + "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", +) -flags.DEFINE_boolean( - "no_cuda", False, - "Avoid using CUDA when available") +flags.DEFINE_boolean("no_cuda", False, "Avoid using CUDA when available") -flags.DEFINE_boolean( - "overwrite_output_dir", False, - "Overwrite the content of the output directory") +flags.DEFINE_boolean("overwrite_output_dir", False, "Overwrite the content of the output directory") -flags.DEFINE_boolean( - "overwrite_cache", False, - "Overwrite the cached training and evaluation sets") +flags.DEFINE_boolean("overwrite_cache", False, "Overwrite the cached training and evaluation sets") -flags.DEFINE_integer( - "seed", 42, - "random seed for initialization") +flags.DEFINE_integer("seed", 42, "random seed for initialization") -flags.DEFINE_boolean( - "fp16", False, - "Whether to use 16-bit (mixed) precision instead of 32-bit") +flags.DEFINE_boolean("fp16", False, "Whether to use 16-bit (mixed) precision instead of 32-bit") flags.DEFINE_string( - "gpus", "0", + "gpus", + "0", "Comma separated list of gpus devices. 
If only one, switch to single " - "gpu strategy, if None takes all the gpus available.") + "gpu strategy, if None takes all the gpus available.", +) -def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id): - if args['max_steps'] > 0: - num_train_steps = args['max_steps'] * args['gradient_accumulation_steps'] - args['num_train_epochs'] = 1 +def train( + args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id +): + if args["max_steps"] > 0: + num_train_steps = args["max_steps"] * args["gradient_accumulation_steps"] + args["num_train_epochs"] = 1 else: - num_train_steps = math.ceil(num_train_examples / train_batch_size) // args['gradient_accumulation_steps'] * args['num_train_epochs'] + num_train_steps = ( + math.ceil(num_train_examples / train_batch_size) + // args["gradient_accumulation_steps"] + * args["num_train_epochs"] + ) writer = tf.summary.create_file_writer("/tmp/mylogs") with strategy.scope(): loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE) - optimizer = create_optimizer(args['learning_rate'], num_train_steps, args['warmup_steps']) + optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"]) - if args['fp16']: - optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic') + if args["fp16"]: + optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic") - loss_metric = tf.keras.metrics.Mean(name='loss', dtype=tf.float32) + loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32) gradient_accumulator = GradientAccumulator() - + logging.info("***** Running training *****") logging.info(" Num examples = %d", num_train_examples) - logging.info(" Num Epochs = %d", args['num_train_epochs']) - logging.info(" Instantaneous batch size per device = %d", args['per_device_train_batch_size']) - logging.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - train_batch_size * args['gradient_accumulation_steps']) - logging.info(" Gradient Accumulation steps = %d", args['gradient_accumulation_steps']) + logging.info(" Num Epochs = %d", args["num_train_epochs"]) + logging.info(" Instantaneous batch size per device = %d", args["per_device_train_batch_size"]) + logging.info( + " Total train batch size (w. 
parallel, distributed & accumulation) = %d", + train_batch_size * args["gradient_accumulation_steps"], + ) + logging.info(" Gradient Accumulation steps = %d", args["gradient_accumulation_steps"]) logging.info(" Total training steps = %d", num_train_steps) model.summary() @@ -214,26 +181,28 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables): if gradient is not None: - scaled_gradient = gradient / (args['n_device'] * args['gradient_accumulation_steps']) + scaled_gradient = gradient / (args["n_device"] * args["gradient_accumulation_steps"]) grads_and_vars.append((scaled_gradient, variable)) else: grads_and_vars.append((gradient, variable)) - optimizer.apply_gradients(grads_and_vars, args['max_grad_norm']) + optimizer.apply_gradients(grads_and_vars, args["max_grad_norm"]) gradient_accumulator.reset() @tf.function def train_step(train_features, train_labels): def step_fn(train_features, train_labels): - inputs = {'attention_mask': train_features['input_mask'], 'training': True} + inputs = {"attention_mask": train_features["input_mask"], "training": True} - if args['model_type'] != "distilbert": - inputs["token_type_ids"] = train_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None + if args["model_type"] != "distilbert": + inputs["token_type_ids"] = ( + train_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None + ) with tf.GradientTape() as tape: - logits = model(train_features['input_ids'], **inputs)[0] + logits = model(train_features["input_ids"], **inputs)[0] logits = tf.reshape(logits, (-1, len(labels) + 1)) - active_loss = tf.reshape(train_features['input_mask'], (-1,)) + active_loss = tf.reshape(train_features["input_mask"], (-1,)) active_logits = tf.boolean_mask(logits, active_loss) train_labels = tf.reshape(train_labels, (-1,)) active_labels = tf.boolean_mask(train_labels, active_loss) @@ -251,34 +220,40 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l return mean_loss current_time = datetime.datetime.now() - train_iterator = master_bar(range(args['num_train_epochs'])) + train_iterator = master_bar(range(args["num_train_epochs"])) global_step = 0 logging_loss = 0.0 for epoch in train_iterator: - epoch_iterator = progress_bar(train_dataset, total=num_train_steps, parent=train_iterator, display=args['n_device'] > 1) + epoch_iterator = progress_bar( + train_dataset, total=num_train_steps, parent=train_iterator, display=args["n_device"] > 1 + ) step = 1 with strategy.scope(): for train_features, train_labels in epoch_iterator: loss = train_step(train_features, train_labels) - if step % args['gradient_accumulation_steps'] == 0: + if step % args["gradient_accumulation_steps"] == 0: strategy.experimental_run_v2(apply_gradients) loss_metric(loss) global_step += 1 - if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0: + if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0: # Log metrics - if args['n_device'] == 1 and args['evaluate_during_training']: # Only evaluate when single GPU otherwise metrics may not average well - y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev") + if ( + args["n_device"] == 1 and args["evaluate_during_training"] + ): # Only evaluate when single GPU otherwise metrics may not average well + y_true, y_pred, eval_loss = evaluate( + args, strategy, model, tokenizer, 
labels, pad_token_label_id, mode="dev" + ) report = metrics.classification_report(y_true, y_pred, digits=4) - + logging.info("Eval at step " + str(global_step) + "\n" + report) logging.info("eval_loss: " + str(eval_loss)) - + precision = metrics.precision_score(y_true, y_pred) recall = metrics.recall_score(y_true, y_pred) f1 = metrics.f1_score(y_true, y_pred) @@ -288,33 +263,35 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l tf.summary.scalar("precision", precision, global_step) tf.summary.scalar("recall", recall, global_step) tf.summary.scalar("f1", f1, global_step) - + lr = optimizer.learning_rate learning_rate = lr(step) with writer.as_default(): tf.summary.scalar("lr", learning_rate, global_step) - tf.summary.scalar("loss", (loss_metric.result() - logging_loss) / args['logging_steps'], global_step) - + tf.summary.scalar( + "loss", (loss_metric.result() - logging_loss) / args["logging_steps"], global_step + ) + logging_loss = loss_metric.result() with writer.as_default(): tf.summary.scalar("loss", loss_metric.result(), step=step) - if args['save_steps'] > 0 and global_step % args['save_steps'] == 0: + if args["save_steps"] > 0 and global_step % args["save_steps"] == 0: # Save model checkpoint - output_dir = os.path.join(args['output_dir'], "checkpoint-{}".format(global_step)) + output_dir = os.path.join(args["output_dir"], "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - + model.save_pretrained(output_dir) logging.info("Saving model checkpoint to %s", output_dir) - - train_iterator.child.comment = f'loss : {loss_metric.result()}' + + train_iterator.child.comment = f"loss : {loss_metric.result()}" step += 1 - train_iterator.write(f'loss epoch {epoch + 1}: {loss_metric.result()}') + train_iterator.write(f"loss epoch {epoch + 1}: {loss_metric.result()}") loss_metric.reset_states() @@ -322,13 +299,15 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode): - eval_batch_size = args['per_device_eval_batch_size'] * args['n_device'] - eval_dataset, size = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode) + eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"] + eval_dataset, size = load_and_cache_examples( + args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode + ) eval_dataset = strategy.experimental_distribute_dataset(eval_dataset) preds = None num_eval_steps = math.ceil(size / eval_batch_size) master = master_bar(range(1)) - eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args['n_device'] > 1) + eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args["n_device"] > 1) loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE) loss = 0.0 @@ -337,15 +316,17 @@ def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode) logging.info(" Batch size = %d", eval_batch_size) for eval_features, eval_labels in eval_iterator: - inputs = {'attention_mask': eval_features['input_mask'], 'training': False} + inputs = {"attention_mask": eval_features["input_mask"], "training": False} - if args['model_type'] != "distilbert": - inputs["token_type_ids"] = eval_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None + if args["model_type"] != "distilbert": + inputs["token_type_ids"] = 
( + eval_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None + ) with strategy.scope(): - logits = model(eval_features['input_ids'], **inputs)[0] + logits = model(eval_features["input_ids"], **inputs)[0] tmp_logits = tf.reshape(logits, (-1, len(labels) + 1)) - active_loss = tf.reshape(eval_features['input_mask'], (-1,)) + active_loss = tf.reshape(eval_features["input_mask"], (-1,)) active_logits = tf.boolean_mask(tmp_logits, active_loss) tmp_eval_labels = tf.reshape(eval_labels, (-1,)) active_labels = tf.boolean_mask(tmp_eval_labels, active_loss) @@ -384,11 +365,11 @@ def load_cache(cached_file, max_seq_length): def _decode_record(record): example = tf.io.parse_single_example(record, name_to_features) features = {} - features['input_ids'] = example['input_ids'] - features['input_mask'] = example['input_mask'] - features['segment_ids'] = example['segment_ids'] + features["input_ids"] = example["input_ids"] + features["input_mask"] = example["input_mask"] + features["segment_ids"] = example["segment_ids"] - return features, example['label_ids'] + return features, example["label_ids"] d = tf.data.TFRecordDataset(cached_file) d = d.map(_decode_record, num_parallel_calls=4) @@ -422,39 +403,46 @@ def save_cache(features, cached_features_file): def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_size, mode): - drop_remainder = True if args['tpu'] or mode == 'train' else False + drop_remainder = True if args["tpu"] or mode == "train" else False # Load data features from cache or dataset file - cached_features_file = os.path.join(args['data_dir'], "cached_{}_{}_{}.tf_record".format(mode, - list(filter(None, args['model_name_or_path'].split("/"))).pop(), - str(args['max_seq_length']))) - if os.path.exists(cached_features_file) and not args['overwrite_cache']: + cached_features_file = os.path.join( + args["data_dir"], + "cached_{}_{}_{}.tf_record".format( + mode, list(filter(None, args["model_name_or_path"].split("/"))).pop(), str(args["max_seq_length"]) + ), + ) + if os.path.exists(cached_features_file) and not args["overwrite_cache"]: logging.info("Loading features from cached file %s", cached_features_file) - dataset, size = load_cache(cached_features_file, args['max_seq_length']) + dataset, size = load_cache(cached_features_file, args["max_seq_length"]) else: - logging.info("Creating features from dataset file at %s", args['data_dir']) - examples = read_examples_from_file(args['data_dir'], mode) - features = convert_examples_to_features(examples, labels, args['max_seq_length'], tokenizer, - cls_token_at_end=bool(args['model_type'] in ["xlnet"]), - # xlnet has a cls token at the end - cls_token=tokenizer.cls_token, - cls_token_segment_id=2 if args['model_type'] in ["xlnet"] else 0, - sep_token=tokenizer.sep_token, - sep_token_extra=bool(args['model_type'] in ["roberta"]), - # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 - pad_on_left=bool(args['model_type'] in ["xlnet"]), - # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args['model_type'] in ["xlnet"] else 0, - pad_token_label_id=pad_token_label_id - ) + logging.info("Creating features from dataset file at %s", args["data_dir"]) + examples = read_examples_from_file(args["data_dir"], mode) + features = convert_examples_to_features( + examples, + labels, + args["max_seq_length"], + tokenizer, + cls_token_at_end=bool(args["model_type"] in ["xlnet"]), + # xlnet has a cls token at the end + cls_token=tokenizer.cls_token, + cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0, + sep_token=tokenizer.sep_token, + sep_token_extra=bool(args["model_type"] in ["roberta"]), + # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 + pad_on_left=bool(args["model_type"] in ["xlnet"]), + # pad on the left for xlnet + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0, + pad_token_label_id=pad_token_label_id, + ) logging.info("Saving features into cached file %s", cached_features_file) save_cache(features, cached_features_file) - dataset, size = load_cache(cached_features_file, args['max_seq_length']) + dataset, size = load_cache(cached_features_file, args["max_seq_length"]) - if mode == 'train': + if mode == "train": dataset = dataset.repeat() - dataset = dataset.shuffle(buffer_size=8192, seed=args['seed']) + dataset = dataset.shuffle(buffer_size=8192, seed=args["seed"]) dataset = dataset.batch(batch_size, drop_remainder) dataset = dataset.prefetch(buffer_size=batch_size) @@ -466,98 +454,134 @@ def main(_): logging.set_verbosity(logging.INFO) args = flags.FLAGS.flag_values_dict() - if os.path.exists(args['output_dir']) and os.listdir( - args['output_dir']) and args['do_train'] and not args['overwrite_output_dir']: + if ( + os.path.exists(args["output_dir"]) + and os.listdir(args["output_dir"]) + and args["do_train"] + and not args["overwrite_output_dir"] + ): raise ValueError( "Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format( - args['output_dir'])) + args["output_dir"] + ) + ) - if args['fp16']: + if args["fp16"]: tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True}) - if args['tpu']: - resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args['tpu']) + if args["tpu"]: + resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args["tpu"]) tf.config.experimental_connect_to_cluster(resolver) tf.tpu.experimental.initialize_tpu_system(resolver) strategy = tf.distribute.experimental.TPUStrategy(resolver) - args['n_device'] = args['num_tpu_cores'] - elif len(args['gpus'].split(',')) > 1: - args['n_device'] = len([f"/gpu:{gpu}" for gpu in args['gpus'].split(',')]) - strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args['gpus'].split(',')]) - elif args['no_cuda']: - args['n_device'] = 1 + args["n_device"] = args["num_tpu_cores"] + elif len(args["gpus"].split(",")) > 1: + args["n_device"] = len([f"/gpu:{gpu}" for gpu in args["gpus"].split(",")]) + strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args["gpus"].split(",")]) + elif args["no_cuda"]: + args["n_device"] = 1 strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") else: - args['n_device'] = len(args['gpus'].split(',')) - strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args['gpus'].split(',')[0]) + args["n_device"] = len(args["gpus"].split(",")) + strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args["gpus"].split(",")[0]) - logging.warning("n_device: %s, distributed training: %s, 16-bits training: %s", - args['n_device'], bool(args['n_device'] > 1), args['fp16']) + logging.warning( + "n_device: %s, distributed training: %s, 16-bits training: %s", + args["n_device"], + bool(args["n_device"] > 1), + args["fp16"], + ) - labels = get_labels(args['labels']) + labels = get_labels(args["labels"]) num_labels = len(labels) + 1 pad_token_label_id = 0 - config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']] - config = config_class.from_pretrained(args['config_name'] if args['config_name'] else args['model_name_or_path'], - num_labels=num_labels, - cache_dir=args['cache_dir'] if args['cache_dir'] else None) + config_class, model_class, tokenizer_class = MODEL_CLASSES[args["model_type"]] + config = config_class.from_pretrained( + args["config_name"] if args["config_name"] else args["model_name_or_path"], + num_labels=num_labels, + cache_dir=args["cache_dir"] if args["cache_dir"] else None, + ) logging.info("Training/evaluation parameters %s", args) # Training - if args['do_train']: - tokenizer = tokenizer_class.from_pretrained(args['tokenizer_name'] if args['tokenizer_name'] else args['model_name_or_path'], - do_lower_case=args['do_lower_case'], - cache_dir=args['cache_dir'] if args['cache_dir'] else None) + if args["do_train"]: + tokenizer = tokenizer_class.from_pretrained( + args["tokenizer_name"] if args["tokenizer_name"] else args["model_name_or_path"], + do_lower_case=args["do_lower_case"], + cache_dir=args["cache_dir"] if args["cache_dir"] else None, + ) with strategy.scope(): - model = model_class.from_pretrained(args['model_name_or_path'], - from_pt=bool(".bin" in args['model_name_or_path']), - config=config, - cache_dir=args['cache_dir'] if args['cache_dir'] else None) + model = model_class.from_pretrained( + args["model_name_or_path"], + from_pt=bool(".bin" in args["model_name_or_path"]), + config=config, + cache_dir=args["cache_dir"] if args["cache_dir"] else None, + ) 
model.layers[-1].activation = tf.keras.activations.softmax - train_batch_size = args['per_device_train_batch_size'] * args['n_device'] - train_dataset, num_train_examples = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train") + train_batch_size = args["per_device_train_batch_size"] * args["n_device"] + train_dataset, num_train_examples = load_and_cache_examples( + args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train" + ) train_dataset = strategy.experimental_distribute_dataset(train_dataset) - train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id) - - if not os.path.exists(args['output_dir']): - os.makedirs(args['output_dir']) - - logging.info("Saving model to %s", args['output_dir']) - - model.save_pretrained(args['output_dir']) - tokenizer.save_pretrained(args['output_dir']) + train( + args, + strategy, + train_dataset, + tokenizer, + model, + num_train_examples, + labels, + train_batch_size, + pad_token_label_id, + ) + + if not os.path.exists(args["output_dir"]): + os.makedirs(args["output_dir"]) + + logging.info("Saving model to %s", args["output_dir"]) + + model.save_pretrained(args["output_dir"]) + tokenizer.save_pretrained(args["output_dir"]) # Evaluation - if args['do_eval']: - tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case']) + if args["do_eval"]: + tokenizer = tokenizer_class.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"]) checkpoints = [] results = [] - if args['eval_all_checkpoints']: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + "/**/" + TF2_WEIGHTS_NAME, recursive=True), key=lambda f: int(''.join(filter(str.isdigit, f)) or -1))) - + if args["eval_all_checkpoints"]: + checkpoints = list( + os.path.dirname(c) + for c in sorted( + glob.glob(args["output_dir"] + "/**/" + TF2_WEIGHTS_NAME, recursive=True), + key=lambda f: int("".join(filter(str.isdigit, f)) or -1), + ) + ) + logging.info("Evaluate the following checkpoints: %s", checkpoints) if len(checkpoints) == 0: - checkpoints.append(args['output_dir']) - + checkpoints.append(args["output_dir"]) + for checkpoint in checkpoints: global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final" with strategy.scope(): model = model_class.from_pretrained(checkpoint) - y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev") + y_true, y_pred, eval_loss = evaluate( + args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev" + ) report = metrics.classification_report(y_true, y_pred, digits=4) if global_step: results.append({global_step + "_report": report, global_step + "_loss": eval_loss}) - output_eval_file = os.path.join(args['output_dir'], "eval_results.txt") - + output_eval_file = os.path.join(args["output_dir"], "eval_results.txt") + with tf.io.gfile.GFile(output_eval_file, "w") as writer: for res in results: for key, val in res.items(): @@ -572,26 +596,28 @@ def main(_): writer.write(report) writer.write("\n") - if args['do_predict']: - tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case']) - model = model_class.from_pretrained(args['output_dir']) - eval_batch_size = args['per_device_eval_batch_size'] * args['n_device'] - predict_dataset, _ = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, 
mode="test") + if args["do_predict"]: + tokenizer = tokenizer_class.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"]) + model = model_class.from_pretrained(args["output_dir"]) + eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"] + predict_dataset, _ = load_and_cache_examples( + args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test" + ) y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test") - output_test_results_file = os.path.join(args['output_dir'], "test_results.txt") - output_test_predictions_file = os.path.join(args['output_dir'], "test_predictions.txt") + output_test_results_file = os.path.join(args["output_dir"], "test_results.txt") + output_test_predictions_file = os.path.join(args["output_dir"], "test_predictions.txt") report = metrics.classification_report(y_true, y_pred, digits=4) with tf.io.gfile.GFile(output_test_results_file, "w") as writer: report = metrics.classification_report(y_true, y_pred, digits=4) - + logging.info("\n" + report) - + writer.write(report) writer.write("\n\nloss = " + str(pred_loss)) with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer: - with tf.io.gfile.GFile(os.path.join(args['data_dir'], "test.txt"), "r") as f: + with tf.io.gfile.GFile(os.path.join(args["data_dir"], "test.txt"), "r") as f: example_id = 0 for line in f: diff --git a/examples/run_xnli.py b/examples/run_xnli.py index 74bf295b6..9faba294d 100644 --- a/examples/run_xnli.py +++ b/examples/run_xnli.py @@ -26,8 +26,7 @@ import random import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler try: @@ -37,10 +36,18 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, - BertConfig, BertForSequenceClassification, BertTokenizer, - XLMConfig, XLMForSequenceClassification, XLMTokenizer, - DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForSequenceClassification, + BertTokenizer, + XLMConfig, + XLMForSequenceClassification, + XLMTokenizer, + DistilBertConfig, + DistilBertForSequenceClassification, + DistilBertTokenizer, +) from transformers import AdamW, get_linear_schedule_with_warmup @@ -52,12 +59,14 @@ from transformers import glue_convert_examples_to_features as convert_examples_t logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), ()) +ALL_MODELS = sum( + (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), () +) MODEL_CLASSES = { - 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), - 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer) + "bert": (BertConfig, BertForSequenceClassification, BertTokenizer), + "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer), + "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer), } @@ -85,19 +94,26 @@ def train(args, train_dataset, model, tokenizer): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and 
schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) # Check if saved optimizer or scheduler states exist - if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')): + if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( + os.path.join(args.model_name_or_path, "scheduler.pt") + ): # Load in optimizer and scheduler states - optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt'))) - scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt'))) + optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) + scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: @@ -112,17 +128,21 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. 
parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -132,7 +152,7 @@ def train(args, train_dataset, model, tokenizer): # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path - global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0]) + global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) @@ -143,7 +163,9 @@ def train(args, train_dataset, model, tokenizer): tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() - train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) + train_iterator = trange( + epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] + ) set_seed(args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) @@ -155,16 +177,16 @@ def train(args, train_dataset, model, tokenizer): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None # XLM and DistilBERT don't use segment_ids + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + if args.model_type != "distilbert": + inputs["token_type_ids"] = ( + batch[2] if args.model_type in ["bert"] else None + ) # XLM and DistilBERT don't use segment_ids outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training + loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -188,28 +210,32 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + tb_writer.add_scalar("eval_{}".format(key), value, global_step) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % 
args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) - torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt')) - torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt')) + torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) + torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -258,11 +284,11 @@ def evaluate(args, model, tokenizer, prefix=""): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None # XLM and DistilBERT don't use segment_ids + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + if args.model_type != "distilbert": + inputs["token_type_ids"] = ( + batch[2] if args.model_type in ["bert"] else None + ) # XLM and DistilBERT don't use segment_ids outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -270,16 +296,16 @@ def evaluate(args, model, tokenizer, prefix=""): nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() - out_label_ids = inputs['labels'].detach().cpu().numpy() + out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) + out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps if args.output_mode == "classification": preds = np.argmax(preds, axis=1) else: - raise ValueError('No other `output_mode` for XNLI.') + raise ValueError("No other `output_mode` for XNLI.") result = compute_metrics(eval_task, preds, out_label_ids) results.update(result) @@ -300,27 +326,34 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): processor = processors[task](language=args.language, train_language=args.train_language) output_mode = output_modes[task] # Load data features from cache or dataset file - cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}_{}'.format( - 'test' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length), - str(task), - str(args.train_language if (not evaluate and args.train_language is not None) else args.language))) + cached_features_file = os.path.join( + args.data_dir, + "cached_{}_{}_{}_{}_{}".format( + "test" if evaluate else "train", + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + 
str(task), + str(args.train_language if (not evaluate and args.train_language is not None) else args.language), + ), + ) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", args.data_dir) label_list = processor.get_labels() - examples = processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - features = convert_examples_to_features(examples, - tokenizer, - label_list=label_list, - max_length=args.max_seq_length, - output_mode=output_mode, - pad_on_left=False, - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=0, + examples = ( + processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) + ) + features = convert_examples_to_features( + examples, + tokenizer, + label_list=label_list, + max_length=args.max_seq_length, + output_mode=output_mode, + pad_on_left=False, + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=0, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) @@ -336,7 +369,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): if output_mode == "classification": all_labels = torch.tensor([f.label for f in features], dtype=torch.long) else: - raise ValueError('No other `output_mode` for XNLI.') + raise ValueError("No other `output_mode` for XNLI.") dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) return dataset @@ -346,92 +379,152 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. Should contain the .tsv files (or other data files) for the task.") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--language", default=None, type=str, required=True, - help="Evaluation language. Also train language if `train_language` is set to None.") - parser.add_argument("--train_language", default=None, type=str, - help="Train language if is different of the evaluation language.") - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the .tsv files (or other data files) for the task.", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--language", + default=None, + type=str, + required=True, + help="Evaluation language. 
Also train language if `train_language` is set to None.", + ) + parser.add_argument( + "--train_language", default=None, type=str, help="Train language if is different of the evaluation language." + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the test set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") - - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Avoid using CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") - - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--local_rank", type=int, default=-1, - help="For distributed training: local_rank") - parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the test set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) + + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." 
+ ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") + + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -443,22 +536,30 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) # Prepare XNLI task - args.task_name = 'xnli' + args.task_name = "xnli" if args.task_name not in processors: raise ValueError("Task not found: %s" % (args.task_name)) processor = processors[args.task_name](language=args.language, train_language=args.train_language) @@ -472,17 +573,23 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels, - finetuning_task=args.task_name, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + num_labels=num_labels, + finetuning_task=args.task_name, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -491,14 +598,12 @@ def main(): logger.info("Training/evaluation parameters %s", args) - # Training if args.do_train: train_dataset = 
load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed @@ -508,36 +613,39 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir) model.to(args.device) - # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) - result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) + result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) return results diff --git a/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py b/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py index 33b17bfb6..d32e6fc06 100644 --- a/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py +++ b/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py @@ -34,12 +34,30 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -SAMPLE_TEXT = 'Hello world! cécé herlolip' +SAMPLE_TEXT = "Hello world! 
cécé herlolip" BertAbsConfig = namedtuple( "BertAbsConfig", - ["temp_dir", "large", "use_bert_emb", "finetune_bert", "encoder", "share_emb", "max_pos", "enc_layers", "enc_hidden_size", "enc_heads", "enc_ff_size", "enc_dropout", "dec_layers", "dec_hidden_size", "dec_heads", "dec_ff_size", "dec_dropout"], + [ + "temp_dir", + "large", + "use_bert_emb", + "finetune_bert", + "encoder", + "share_emb", + "max_pos", + "enc_layers", + "enc_hidden_size", + "enc_heads", + "enc_ff_size", + "enc_dropout", + "dec_layers", + "dec_hidden_size", + "dec_heads", + "dec_ff_size", + "dec_dropout", + ], ) @@ -119,7 +137,9 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path): output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0] output_original_generator = original.generator(output_original_model) - output_converted_model = new_model(encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask)[0] + output_converted_model = new_model( + encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask + )[0] output_converted_generator = new_model.generator(output_converted_model) maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item() @@ -136,28 +156,21 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path): # The model has been saved with torch.save(model) and this is bound to the exact # directory structure. We save the state_dict instead. logging.info("saving the model's state dictionary") - torch.save(new_model.state_dict(), "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin") + torch.save( + new_model.state_dict(), "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin" + ) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--bertabs_checkpoint_path", - default=None, - type=str, - required=True, - help="Path the official PyTorch dump.", + "--bertabs_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump.", ) parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the output PyTorch model.", + "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model.", ) args = parser.parse_args() convert_bertabs_checkpoints( - args.bertabs_checkpoint_path, - args.pytorch_dump_folder_path, + args.bertabs_checkpoint_path, args.pytorch_dump_folder_path, ) diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py index 5bf1599ad..d4d8c6648 100644 --- a/examples/summarization/modeling_bertabs.py +++ b/examples/summarization/modeling_bertabs.py @@ -56,40 +56,22 @@ class BertAbs(BertAbsPreTrainedModel): load_bert_pretrained_extractive = True if bert_extractive_checkpoint else False if load_bert_pretrained_extractive: self.bert.model.load_state_dict( - dict( - [ - (n[11:], p) - for n, p in bert_extractive_checkpoint.items() - if n.startswith("bert.model") - ] - ), + dict([(n[11:], p) for n, p in bert_extractive_checkpoint.items() if n.startswith("bert.model")]), strict=True, ) self.vocab_size = self.bert.model.config.vocab_size if args.max_pos > 512: - my_pos_embeddings = nn.Embedding( - args.max_pos, self.bert.model.config.hidden_size - ) - my_pos_embeddings.weight.data[ - :512 - ] = self.bert.model.embeddings.position_embeddings.weight.data - my_pos_embeddings.weight.data[ 
- 512: - ] = self.bert.model.embeddings.position_embeddings.weight.data[-1][ + my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size) + my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data + my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][ None, : - ].repeat( - args.max_pos - 512, 1 - ) + ].repeat(args.max_pos - 512, 1) self.bert.model.embeddings.position_embeddings = my_pos_embeddings - tgt_embeddings = nn.Embedding( - self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0 - ) + tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0) - tgt_embeddings.weight = copy.deepcopy( - self.bert.model.embeddings.word_embeddings.weight - ) + tgt_embeddings.weight = copy.deepcopy(self.bert.model.embeddings.word_embeddings.weight) self.decoder = TransformerDecoder( self.args.dec_layers, @@ -102,9 +84,7 @@ class BertAbs(BertAbsPreTrainedModel): ) gen_func = nn.LogSoftmax(dim=-1) - self.generator = nn.Sequential( - nn.Linear(args.dec_hidden_size, args.vocab_size), gen_func - ) + self.generator = nn.Sequential(nn.Linear(args.dec_hidden_size, args.vocab_size), gen_func) self.generator[0].weight = self.decoder.embeddings.weight load_from_checkpoints = False if checkpoint is None else True @@ -127,25 +107,14 @@ class BertAbs(BertAbsPreTrainedModel): p.data.zero_() def forward( - self, - encoder_input_ids, - decoder_input_ids, - token_type_ids, - encoder_attention_mask, - decoder_attention_mask, + self, encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask, ): encoder_output = self.bert( - input_ids=encoder_input_ids, - token_type_ids=token_type_ids, - attention_mask=encoder_attention_mask, + input_ids=encoder_input_ids, token_type_ids=token_type_ids, attention_mask=encoder_attention_mask, ) encoder_hidden_states = encoder_output[0] - dec_state = self.decoder.init_decoder_state( - encoder_input_ids, encoder_hidden_states - ) - decoder_outputs, _ = self.decoder( - decoder_input_ids[:, :-1], encoder_hidden_states, dec_state - ) + dec_state = self.decoder.init_decoder_state(encoder_input_ids, encoder_hidden_states) + decoder_outputs, _ = self.decoder(decoder_input_ids[:, :-1], encoder_hidden_states, dec_state) return decoder_outputs @@ -162,10 +131,7 @@ class Bert(nn.Module): self.eval() with torch.no_grad(): encoder_outputs, _ = self.model( - input_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - **kwargs + input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, **kwargs ) return encoder_outputs @@ -196,10 +162,7 @@ class TransformerDecoder(nn.Module): # Build TransformerDecoder. 
self.transformer_layers = nn.ModuleList( - [ - TransformerDecoderLayer(d_model, heads, d_ff, dropout) - for _ in range(num_layers) - ] + [TransformerDecoderLayer(d_model, heads, d_ff, dropout) for _ in range(num_layers)] ) self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) @@ -236,20 +199,14 @@ class TransformerDecoder(nn.Module): # Decoder padding mask tgt_words = tgt tgt_batch, tgt_len = tgt_words.size() - tgt_pad_mask = ( - tgt_words.data.eq(padding_idx).unsqueeze(1).expand(tgt_batch, tgt_len, tgt_len) - ) + tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1).expand(tgt_batch, tgt_len, tgt_len) # Encoder padding mask if memory_mask is not None: src_len = memory_mask.size(-1) src_pad_mask = memory_mask.expand(src_batch, tgt_len, src_len) else: - src_pad_mask = ( - src_words.data.eq(padding_idx) - .unsqueeze(1) - .expand(src_batch, tgt_len, src_len) - ) + src_pad_mask = src_words.data.eq(padding_idx).unsqueeze(1).expand(src_batch, tgt_len, src_len) # Pass through the embeddings emb = self.embeddings(input_ids) @@ -271,9 +228,7 @@ class TransformerDecoder(nn.Module): src_pad_mask, tgt_pad_mask, previous_input=prev_layer_input, - layer_cache=state.cache["layer_{}".format(i)] - if state.cache is not None - else None, + layer_cache=state.cache["layer_{}".format(i)] if state.cache is not None else None, step=step, ) if state.cache is None: @@ -303,9 +258,7 @@ class PositionalEncoding(nn.Module): def __init__(self, dropout, dim, max_len=5000): pe = torch.zeros(max_len, dim) position = torch.arange(0, max_len).unsqueeze(1) - div_term = torch.exp( - (torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim)) - ) + div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim))) pe[:, 0::2] = torch.sin(position.float() * div_term) pe[:, 1::2] = torch.cos(position.float() * div_term) pe = pe.unsqueeze(0) @@ -356,14 +309,7 @@ class TransformerDecoderLayer(nn.Module): self.register_buffer("mask", mask) def forward( - self, - inputs, - memory_bank, - src_pad_mask, - tgt_pad_mask, - previous_input=None, - layer_cache=None, - step=None, + self, inputs, memory_bank, src_pad_mask, tgt_pad_mask, previous_input=None, layer_cache=None, step=None, ): """ Args: @@ -380,34 +326,20 @@ class TransformerDecoderLayer(nn.Module): * all_input `[batch_size x current_step x model_dim]` """ - dec_mask = torch.gt( - tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)], 0 - ) + dec_mask = torch.gt(tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)], 0) input_norm = self.layer_norm_1(inputs) all_input = input_norm if previous_input is not None: all_input = torch.cat((previous_input, input_norm), dim=1) dec_mask = None - query = self.self_attn( - all_input, - all_input, - input_norm, - mask=dec_mask, - layer_cache=layer_cache, - type="self", - ) + query = self.self_attn(all_input, all_input, input_norm, mask=dec_mask, layer_cache=layer_cache, type="self",) query = self.drop(query) + inputs query_norm = self.layer_norm_2(query) mid = self.context_attn( - memory_bank, - memory_bank, - query_norm, - mask=src_pad_mask, - layer_cache=layer_cache, - type="context", + memory_bank, memory_bank, query_norm, mask=src_pad_mask, layer_cache=layer_cache, type="context", ) output = self.feed_forward(self.drop(mid) + query) @@ -492,14 +424,7 @@ class MultiHeadedAttention(nn.Module): self.final_linear = nn.Linear(model_dim, model_dim) def forward( - self, - key, - value, - query, - mask=None, - layer_cache=None, - type=None, - 
predefined_graph_1=None, + self, key, value, query, mask=None, layer_cache=None, type=None, predefined_graph_1=None, ): """ Compute the context vector and the attention vectors. @@ -531,11 +456,7 @@ class MultiHeadedAttention(nn.Module): def unshape(x): """ compute context """ - return ( - x.transpose(1, 2) - .contiguous() - .view(batch_size, -1, head_count * dim_per_head) - ) + return x.transpose(1, 2).contiguous().view(batch_size, -1, head_count * dim_per_head) # 1) Project key, value, and query. if layer_cache is not None: @@ -554,9 +475,7 @@ class MultiHeadedAttention(nn.Module): if layer_cache["self_keys"] is not None: key = torch.cat((layer_cache["self_keys"].to(device), key), dim=2) if layer_cache["self_values"] is not None: - value = torch.cat( - (layer_cache["self_values"].to(device), value), dim=2 - ) + value = torch.cat((layer_cache["self_values"].to(device), value), dim=2) layer_cache["self_keys"] = key layer_cache["self_values"] = value elif type == "context": @@ -637,13 +556,9 @@ class DecoderState(object): sizes = e.size() br = sizes[1] if len(sizes) == 3: - sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2])[ - :, :, idx - ] + sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2])[:, :, idx] else: - sent_states = e.view( - sizes[0], beam_size, br // beam_size, sizes[2], sizes[3] - )[:, :, idx] + sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2], sizes[3])[:, :, idx] sent_states.data.copy_(sent_states.data.index_select(1, positions)) @@ -716,11 +631,7 @@ class TransformerDecoderState(DecoderState): def gelu(x): - return ( - 0.5 - * x - * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - ) + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) class PositionwiseFeedForward(nn.Module): @@ -758,9 +669,7 @@ class PositionwiseFeedForward(nn.Module): def build_predictor(args, tokenizer, symbols, model, logger=None): # we should be able to refactor the global scorer a lot scorer = GNMTGlobalScorer(args.alpha, length_penalty="wu") - translator = Translator( - args, model, tokenizer, symbols, global_scorer=scorer, logger=logger - ) + translator = Translator(args, model, tokenizer, symbols, global_scorer=scorer, logger=logger) return translator @@ -891,9 +800,7 @@ class Translator(object): Shouldn't need the original dataset. """ with torch.no_grad(): - return self._fast_translate_batch( - batch, self.max_length, min_length=self.min_length - ) + return self._fast_translate_batch(batch, self.max_length, min_length=self.min_length) # Where the beam search lives # I have no idea why it is being called from the method above @@ -912,26 +819,18 @@ class Translator(object): mask_src = batch.mask_src src_features = self.model.bert(src, segs, mask_src) - dec_states = self.model.decoder.init_decoder_state( - src, src_features, with_cache=True - ) + dec_states = self.model.decoder.init_decoder_state(src, src_features, with_cache=True) device = src_features.device # Tile states and memory beam_size times. 
dec_states.map_batch_fn(lambda state, dim: tile(state, beam_size, dim=dim)) src_features = tile(src_features, beam_size, dim=0) batch_offset = torch.arange(batch_size, dtype=torch.long, device=device) - beam_offset = torch.arange( - 0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device - ) - alive_seq = torch.full( - [batch_size * beam_size, 1], self.start_token, dtype=torch.long, device=device - ) + beam_offset = torch.arange(0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device) + alive_seq = torch.full([batch_size * beam_size, 1], self.start_token, dtype=torch.long, device=device) # Give full probability to the first beam on the first step. - topk_log_probs = torch.tensor( - [0.0] + [float("-inf")] * (beam_size - 1), device=device - ).repeat(batch_size) + topk_log_probs = torch.tensor([0.0] + [float("-inf")] * (beam_size - 1), device=device).repeat(batch_size) # Structure that holds finished hypotheses. hypotheses = [[] for _ in range(batch_size)] # noqa: F812 @@ -948,9 +847,7 @@ class Translator(object): # Decoder forward. decoder_input = decoder_input.transpose(0, 1) - dec_out, dec_states = self.model.decoder( - decoder_input, src_features, dec_states, step=step - ) + dec_out, dec_states = self.model.decoder(decoder_input, src_features, dec_states, step=step) # Generator forward. log_probs = self.generator.forward(dec_out.transpose(0, 1).squeeze(0)) @@ -978,10 +875,7 @@ class Translator(object): words = " ".join(words).replace(" ##", "").split() if len(words) <= 3: continue - trigrams = [ - (words[i - 1], words[i], words[i + 1]) - for i in range(1, len(words) - 1) - ] + trigrams = [(words[i - 1], words[i], words[i + 1]) for i in range(1, len(words) - 1)] trigram = tuple(trigrams[-1]) if trigram in trigrams[:-1]: fail = True @@ -999,15 +893,11 @@ class Translator(object): topk_ids = topk_ids.fmod(vocab_size) # Map beam_index to batch_index in the flat representation. - batch_index = topk_beam_index + beam_offset[ - : topk_beam_index.size(0) - ].unsqueeze(1) + batch_index = topk_beam_index + beam_offset[: topk_beam_index.size(0)].unsqueeze(1) select_indices = batch_index.view(-1) # Append last prediction. - alive_seq = torch.cat( - [alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1 - ) + alive_seq = torch.cat([alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1) is_finished = topk_ids.eq(self.end_token) if step + 1 == max_length: @@ -1040,15 +930,11 @@ class Translator(object): topk_log_probs = topk_log_probs.index_select(0, non_finished) batch_index = batch_index.index_select(0, non_finished) batch_offset = batch_offset.index_select(0, non_finished) - alive_seq = predictions.index_select(0, non_finished).view( - -1, alive_seq.size(-1) - ) + alive_seq = predictions.index_select(0, non_finished).view(-1, alive_seq.size(-1)) # Reorder states. 
select_indices = batch_index.view(-1) src_features = src_features.index_select(0, select_indices) - dec_states.map_batch_fn( - lambda state, dim: state.index_select(dim, select_indices) - ) + dec_states.map_batch_fn(lambda state, dim: state.index_select(dim, select_indices)) return results @@ -1089,14 +975,7 @@ def tile(x, count, dim=0): out_size = list(x.size()) out_size[0] *= count batch = x.size(0) - x = ( - x.view(batch, -1) - .transpose(0, 1) - .repeat(count, 1) - .transpose(0, 1) - .contiguous() - .view(*out_size) - ) + x = x.view(batch, -1).transpose(0, 1).repeat(count, 1).transpose(0, 1).contiguous().view(*out_size) if dim != 0: x = x.permute(perm).contiguous() return x @@ -1107,6 +986,7 @@ def tile(x, count, dim=0): # a finetuning script. # + class BertSumOptimizer(object): """ Specific optimizer for BertSum. @@ -1126,16 +1006,10 @@ class BertSumOptimizer(object): self.optimizers = { "encoder": torch.optim.Adam( - model.encoder.parameters(), - lr=lr["encoder"], - betas=(beta_1, beta_2), - eps=eps, + model.encoder.parameters(), lr=lr["encoder"], betas=(beta_1, beta_2), eps=eps, ), "decoder": torch.optim.Adam( - model.decoder.parameters(), - lr=lr["decoder"], - betas=(beta_1, beta_2), - eps=eps, + model.decoder.parameters(), lr=lr["decoder"], betas=(beta_1, beta_2), eps=eps, ), } @@ -1143,9 +1017,7 @@ class BertSumOptimizer(object): self.current_learning_rates = {} def _update_rate(self, stack): - return self.lr[stack] * min( - self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-1.5) - ) + return self.lr[stack] * min(self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-1.5)) def zero_grad(self): self.optimizer_decoder.zero_grad() diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index 3c339d0c3..36210d999 100644 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -25,9 +25,7 @@ logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout, level=logging.INFO) -Batch = namedtuple( - "Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"] -) +Batch = namedtuple("Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"]) def evaluate(args): @@ -48,13 +46,14 @@ def evaluate(args): import rouge import nltk - nltk.download('punkt') + + nltk.download("punkt") rouge_evaluator = rouge.Rouge( - metrics=['rouge-n', 'rouge-l'], + metrics=["rouge-n", "rouge-l"], max_n=2, limit_length=True, length_limit=args.beam_size, - length_limit_type='words', + length_limit_type="words", apply_avg=True, apply_best=False, alpha=0.5, # Default F1_score @@ -161,15 +160,15 @@ Recall >> {:.3f} F1 >> {:.3f} Precision >> {:.3f} Recall >> {:.3f}""".format( - scores['rouge-1']['f'], - scores['rouge-1']['p'], - scores['rouge-1']['r'], - scores['rouge-2']['f'], - scores['rouge-2']['p'], - scores['rouge-2']['r'], - scores['rouge-l']['f'], - scores['rouge-l']['p'], - scores['rouge-l']['r'], + scores["rouge-1"]["f"], + scores["rouge-1"]["p"], + scores["rouge-1"]["r"], + scores["rouge-2"]["f"], + scores["rouge-2"]["p"], + scores["rouge-2"]["r"], + scores["rouge-l"]["f"], + scores["rouge-l"]["p"], + scores["rouge-l"]["r"], ) @@ -187,9 +186,7 @@ def build_data_iterator(args, tokenizer): dataset = load_and_cache_examples(args, tokenizer) sampler = SequentialSampler(dataset) collate_fn = lambda data: collate(data, tokenizer, block_size=512, device=args.device) - iterator = DataLoader( - dataset, sampler=sampler, batch_size=args.batch_size, 
collate_fn=collate_fn,
-    )
+    iterator = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,)
     return iterator
@@ -210,14 +207,9 @@ def collate(data, tokenizer, block_size, device):
     names = [name for name, _, _ in data]
     summaries = [" ".join(summary_list) for _, _, summary_list in data]
-    encoded_text = [
-        encode_for_summarization(story, summary, tokenizer) for _, story, summary in data
-    ]
+    encoded_text = [encode_for_summarization(story, summary, tokenizer) for _, story, summary in data]
     encoded_stories = torch.tensor(
-        [
-            fit_to_block_size(story, block_size, tokenizer.pad_token_id)
-            for story, _ in encoded_text
-        ]
+        [fit_to_block_size(story, block_size, tokenizer.pad_token_id) for story, _ in encoded_text]
     )
     encoder_token_type_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id)
     encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id)
@@ -272,38 +264,23 @@ def main():
     )
     # EVALUATION options
     parser.add_argument(
-        "--no_cuda",
-        default=False,
-        type=bool,
-        help="Whether to force the execution on CPU.",
+        "--no_cuda", default=False, type=bool, help="Whether to force the execution on CPU.",
     )
     parser.add_argument(
         "--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
     )
     # BEAM SEARCH arguments
     parser.add_argument(
-        "--min_length",
-        default=50,
-        type=int,
-        help="Minimum number of tokens for the summaries.",
+        "--min_length", default=50, type=int, help="Minimum number of tokens for the summaries.",
    )
    parser.add_argument(
-        "--max_length",
-        default=200,
-        type=int,
-        help="Maixmum number of tokens for the summaries.",
+        "--max_length", default=200, type=int, help="Maximum number of tokens for the summaries.",
    )
    parser.add_argument(
-        "--beam_size",
-        default=5,
-        type=int,
-        help="The number of beams to start with for each example.",
+        "--beam_size", default=5, type=int, help="The number of beams to start with for each example.",
    )
    parser.add_argument(
-        "--alpha",
-        default=0.95,
-        type=float,
-        help="The value of alpha for the length penalty in the beam search.",
+        "--alpha", default=0.95, type=float, help="The value of alpha for the length penalty in the beam search.",
    )
    parser.add_argument(
        "--block_trigram",
diff --git a/examples/summarization/utils_summarization.py b/examples/summarization/utils_summarization.py
index 1d8c436ac..96470f47a 100644
--- a/examples/summarization/utils_summarization.py
+++ b/examples/summarization/utils_summarization.py
@@ -68,9 +68,7 @@ def process_story(raw_story):
     Raises:
         IndexError: If the stoy is empty or contains no highlights.
     """
-    nonempty_lines = list(
-        filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")])
-    )
+    nonempty_lines = list(filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")]))
     # for some unknown reason some lines miss a period, add it
     nonempty_lines = [_add_missing_period(line) for line in nonempty_lines]
@@ -135,13 +133,9 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer):
        sentences.
""" story_lines_token_ids = [tokenizer.encode(line) for line in story_lines] - story_token_ids = [ - token for sentence in story_lines_token_ids for token in sentence - ] + story_token_ids = [token for sentence in story_lines_token_ids for token in sentence] summary_lines_token_ids = [tokenizer.encode(line) for line in summary_lines] - summary_token_ids = [ - token for sentence in summary_lines_token_ids for token in sentence - ] + summary_token_ids = [token for sentence in summary_lines_token_ids for token in sentence] return story_token_ids, summary_token_ids diff --git a/examples/summarization/utils_summarization_test.py b/examples/summarization/utils_summarization_test.py index 8bfbf6ab2..253eae388 100644 --- a/examples/summarization/utils_summarization_test.py +++ b/examples/summarization/utils_summarization_test.py @@ -33,25 +33,19 @@ class SummarizationDataProcessingTest(unittest.TestCase): """ Pad the sequence with 0 if the sequence is smaller than the block size.""" sequence = [1, 2, 3, 4] expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0] - self.assertEqual( - fit_to_block_size(sequence, self.block_size, 0), expected_output - ) + self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output) def test_fit_to_block_sequence_fit_exactly(self): """ Do nothing if the sequence is the right size. """ sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - self.assertEqual( - fit_to_block_size(sequence, self.block_size, 0), expected_output - ) + self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output) def test_fit_to_block_sequence_too_big(self): """ Truncate the sequence if it is too long. """ sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - self.assertEqual( - fit_to_block_size(sequence, self.block_size, 0), expected_output - ) + self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output) def test_process_story_no_highlights(self): """ Processing a story with no highlights returns an empty list for the summary. 
@@ -95,9 +89,7 @@ class SummarizationDataProcessingTest(unittest.TestCase): def test_build_mask(self): sequence = torch.tensor([1, 2, 3, 4, 23, 23, 23]) expected = torch.tensor([1, 1, 1, 1, 0, 0, 0]) - np.testing.assert_array_equal( - build_mask(sequence, 23).numpy(), expected.numpy() - ) + np.testing.assert_array_equal(build_mask(sequence, 23).numpy(), expected.numpy()) def test_build_mask_with_padding_equal_to_one(self): sequence = torch.tensor([8, 2, 3, 4, 1, 1, 1]) @@ -106,12 +98,8 @@ class SummarizationDataProcessingTest(unittest.TestCase): def test_compute_token_type_ids(self): separator = 101 - batch = torch.tensor( - [[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]] - ) - expected = torch.tensor( - [[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]] - ) + batch = torch.tensor([[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]]) + expected = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]]) result = compute_token_type_ids(batch, separator) np.testing.assert_array_equal(result, expected) diff --git a/examples/test_examples.py b/examples/test_examples.py index 632d2f728..1293559c2 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -35,34 +35,36 @@ logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger() + def get_setup_file(): parser = argparse.ArgumentParser() - parser.add_argument('-f') + parser.add_argument("-f") args = parser.parse_args() return args.f -class ExamplesTests(unittest.TestCase): +class ExamplesTests(unittest.TestCase): def test_run_glue(self): stream_handler = logging.StreamHandler(sys.stdout) logger.addHandler(stream_handler) - testargs = ["run_glue.py", - "--data_dir=./examples/tests_samples/MRPC/", - "--task_name=mrpc", - "--do_train", - "--do_eval", - "--output_dir=./examples/tests_samples/temp_dir", - "--per_gpu_train_batch_size=2", - "--per_gpu_eval_batch_size=1", - "--learning_rate=1e-4", - "--max_steps=10", - "--warmup_steps=2", - "--overwrite_output_dir", - "--seed=42"] - model_type, model_name = ("--model_type=bert", - "--model_name_or_path=bert-base-uncased") - with patch.object(sys, 'argv', testargs + [model_type, model_name]): + testargs = [ + "run_glue.py", + "--data_dir=./examples/tests_samples/MRPC/", + "--task_name=mrpc", + "--do_train", + "--do_eval", + "--output_dir=./examples/tests_samples/temp_dir", + "--per_gpu_train_batch_size=2", + "--per_gpu_eval_batch_size=1", + "--learning_rate=1e-4", + "--max_steps=10", + "--warmup_steps=2", + "--overwrite_output_dir", + "--seed=42", + ] + model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased") + with patch.object(sys, "argv", testargs + [model_type, model_name]): result = run_glue.main() for value in result.values(): self.assertGreaterEqual(value, 0.75) @@ -71,40 +73,38 @@ class ExamplesTests(unittest.TestCase): stream_handler = logging.StreamHandler(sys.stdout) logger.addHandler(stream_handler) - testargs = ["run_squad.py", - "--data_dir=./examples/tests_samples/SQUAD", - "--model_name=bert-base-uncased", - "--output_dir=./examples/tests_samples/temp_dir", - "--max_steps=10", - "--warmup_steps=2", - "--do_train", - "--do_eval", - "--version_2_with_negative", - "--learning_rate=2e-4", - "--per_gpu_train_batch_size=2", - "--per_gpu_eval_batch_size=1", - "--overwrite_output_dir", - "--seed=42"] - model_type, model_name = ("--model_type=bert", - "--model_name_or_path=bert-base-uncased") - with patch.object(sys, 'argv', testargs + [model_type, model_name]): + testargs = [ + 
"run_squad.py", + "--data_dir=./examples/tests_samples/SQUAD", + "--model_name=bert-base-uncased", + "--output_dir=./examples/tests_samples/temp_dir", + "--max_steps=10", + "--warmup_steps=2", + "--do_train", + "--do_eval", + "--version_2_with_negative", + "--learning_rate=2e-4", + "--per_gpu_train_batch_size=2", + "--per_gpu_eval_batch_size=1", + "--overwrite_output_dir", + "--seed=42", + ] + model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased") + with patch.object(sys, "argv", testargs + [model_type, model_name]): result = run_squad.main() - self.assertGreaterEqual(result['f1'], 30) - self.assertGreaterEqual(result['exact'], 30) + self.assertGreaterEqual(result["f1"], 30) + self.assertGreaterEqual(result["exact"], 30) def test_generation(self): stream_handler = logging.StreamHandler(sys.stdout) logger.addHandler(stream_handler) - testargs = ["run_generation.py", - "--prompt=Hello", - "--length=10", - "--seed=42"] - model_type, model_name = ("--model_type=openai-gpt", - "--model_name_or_path=openai-gpt") - with patch.object(sys, 'argv', testargs + [model_type, model_name]): + testargs = ["run_generation.py", "--prompt=Hello", "--length=10", "--seed=42"] + model_type, model_name = ("--model_type=openai-gpt", "--model_name_or_path=openai-gpt") + with patch.object(sys, "argv", testargs + [model_type, model_name]): result = run_generation.main() self.assertGreaterEqual(len(result), 10) + if __name__ == "__main__": unittest.main() diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py index a131a6392..492eb23e3 100644 --- a/examples/utils_multiple_choice.py +++ b/examples/utils_multiple_choice.py @@ -55,19 +55,10 @@ class InputExample(object): class InputFeatures(object): - def __init__(self, - example_id, - choices_features, - label - - ): + def __init__(self, example_id, choices_features, label): self.example_id = example_id self.choices_features = [ - { - 'input_ids': input_ids, - 'input_mask': input_mask, - 'segment_ids': segment_ids - } + {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids} for input_ids, input_mask, segment_ids in choices_features ] self.label = label @@ -99,29 +90,29 @@ class RaceProcessor(DataProcessor): def get_train_examples(self, data_dir): """See base class.""" logger.info("LOOKING AT {} train".format(data_dir)) - high = os.path.join(data_dir, 'train/high') - middle = os.path.join(data_dir, 'train/middle') + high = os.path.join(data_dir, "train/high") + middle = os.path.join(data_dir, "train/middle") high = self._read_txt(high) middle = self._read_txt(middle) - return self._create_examples(high + middle, 'train') + return self._create_examples(high + middle, "train") def get_dev_examples(self, data_dir): """See base class.""" logger.info("LOOKING AT {} dev".format(data_dir)) - high = os.path.join(data_dir, 'dev/high') - middle = os.path.join(data_dir, 'dev/middle') + high = os.path.join(data_dir, "dev/high") + middle = os.path.join(data_dir, "dev/middle") high = self._read_txt(high) middle = self._read_txt(middle) - return self._create_examples(high + middle, 'dev') + return self._create_examples(high + middle, "dev") def get_test_examples(self, data_dir): """See base class.""" logger.info("LOOKING AT {} test".format(data_dir)) - high = os.path.join(data_dir, 'test/high') - middle = os.path.join(data_dir, 'test/middle') + high = os.path.join(data_dir, "test/high") + middle = os.path.join(data_dir, "test/middle") high = self._read_txt(high) middle = self._read_txt(middle) - 
return self._create_examples(high + middle, 'test') + return self._create_examples(high + middle, "test") def get_labels(self): """See base class.""" @@ -131,13 +122,12 @@ class RaceProcessor(DataProcessor): lines = [] files = glob.glob(input_dir + "/*txt") for file in tqdm.tqdm(files, desc="read files"): - with open(file, 'r', encoding='utf-8') as fin: + with open(file, "r", encoding="utf-8") as fin: data_raw = json.load(fin) data_raw["race_id"] = file lines.append(data_raw) return lines - def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] @@ -145,19 +135,22 @@ class RaceProcessor(DataProcessor): race_id = "%s-%s" % (set_type, data_raw["race_id"]) article = data_raw["article"] for i in range(len(data_raw["answers"])): - truth = str(ord(data_raw['answers'][i]) - ord('A')) - question = data_raw['questions'][i] - options = data_raw['options'][i] + truth = str(ord(data_raw["answers"][i]) - ord("A")) + question = data_raw["questions"][i] + options = data_raw["options"][i] examples.append( InputExample( example_id=race_id, question=question, - contexts=[article, article, article, article], # this is not efficient but convenient + contexts=[article, article, article, article], # this is not efficient but convenient endings=[options[0], options[1], options[2], options[3]], - label=truth)) + label=truth, + ) + ) return examples + class SwagProcessor(DataProcessor): """Processor for the SWAG data set.""" @@ -179,27 +172,25 @@ class SwagProcessor(DataProcessor): "setting!" ) return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test") + def get_labels(self): """See base class.""" return ["0", "1", "2", "3"] def _read_csv(self, input_file): - with open(input_file, 'r', encoding='utf-8') as f: + with open(input_file, "r", encoding="utf-8") as f: reader = csv.reader(f) lines = [] for line in reader: if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) + line = list(unicode(cell, "utf-8") for cell in line) lines.append(line) return lines - def _create_examples(self, lines: List[List[str]], type: str): """Creates examples for the training and dev sets.""" - if type == "train" and lines[0][-1] != 'label': - raise ValueError( - "For training, the input file must contain a label column." - ) + if type == "train" and lines[0][-1] != "label": + raise ValueError("For training, the input file must contain a label column.") examples = [ InputExample( @@ -207,10 +198,11 @@ class SwagProcessor(DataProcessor): question=line[5], # in the swag dataset, the # common beginning of each # choice is stored in "sent2". - contexts = [line[4], line[4], line[4], line[4]], - endings = [line[7], line[8], line[9], line[10]], - label=line[11] - ) for line in lines[1:] # we skip the line with the column names + contexts=[line[4], line[4], line[4], line[4]], + endings=[line[7], line[8], line[9], line[10]], + label=line[11], + ) + for line in lines[1:] # we skip the line with the column names ] return examples @@ -238,15 +230,14 @@ class ArcProcessor(DataProcessor): return ["0", "1", "2", "3"] def _read_json(self, input_file): - with open(input_file, 'r', encoding='utf-8') as fin: + with open(input_file, "r", encoding="utf-8") as fin: lines = fin.readlines() return lines - def _create_examples(self, lines, type): """Creates examples for the training and dev sets.""" - #There are two types of labels. They should be normalized + # There are two types of labels. 
They should be normalized
        def normalize(truth):
            if truth in "ABCD":
                return ord(truth) - ord("A")
@@ -283,12 +274,18 @@ class ArcProcessor(DataProcessor):
            if len(options) == 4:
                examples.append(
                    InputExample(
-                        example_id = id,
+                        example_id=id,
                        question=question,
-                        contexts=[options[0]["para"].replace("_", ""), options[1]["para"].replace("_", ""),
-                                  options[2]["para"].replace("_", ""), options[3]["para"].replace("_", "")],
+                        contexts=[
+                            options[0]["para"].replace("_", ""),
+                            options[1]["para"].replace("_", ""),
+                            options[2]["para"].replace("_", ""),
+                            options[3]["para"].replace("_", ""),
+                        ],
                        endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
-                        label=truth))
+                        label=truth,
+                    )
+                )
        if type == "train":
            assert len(examples) > 1
@@ -316,7 +313,7 @@ def convert_examples_to_features(
    Loads a data file into a list of `InputFeatures`
    """
-    label_map = {label : i for i, label in enumerate(label_list)}
+    label_map = {label: i for i, label in enumerate(label_list)}
    features = []
    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
@@ -331,16 +328,13 @@ def convert_examples_to_features(
            else:
                text_b = example.question + " " + ending
-            inputs = tokenizer.encode_plus(
-                text_a,
-                text_b,
-                add_special_tokens=True,
-                max_length=max_length,
-            )
-            if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0:
-                logger.info('Attention! you are cropping tokens (swag task is ok). '
-                            'If you are training ARC and RACE and you are poping question + options,'
-                            'you need to try to use a bigger max seq length!')
+            inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length,)
+            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
+                logger.info(
+                    "Attention! you are cropping tokens (swag task is ok). "
+                    "If you are training ARC and RACE and you are popping question + options,"
+                    "you need to try to use a bigger max seq length!"
+ ) input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] @@ -364,7 +358,6 @@ def convert_examples_to_features( assert len(token_type_ids) == max_length choices_features.append((input_ids, attention_mask, token_type_ids)) - label = label_map[example.label] if ex_index < 2: @@ -372,33 +365,17 @@ def convert_examples_to_features( logger.info("race_id: {}".format(example.example_id)) for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features): logger.info("choice: {}".format(choice_idx)) - logger.info("input_ids: {}".format(' '.join(map(str, input_ids)))) - logger.info("attention_mask: {}".format(' '.join(map(str, attention_mask)))) - logger.info("token_type_ids: {}".format(' '.join(map(str, token_type_ids)))) + logger.info("input_ids: {}".format(" ".join(map(str, input_ids)))) + logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask)))) + logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids)))) logger.info("label: {}".format(label)) - features.append( - InputFeatures( - example_id=example.example_id, - choices_features=choices_features, - label=label, - ) - ) + features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,)) return features +processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor} -processors = { - "race": RaceProcessor, - "swag": SwagProcessor, - "arc": ArcProcessor -} - - -MULTIPLE_CHOICE_TASKS_NUM_LABELS = { - "race", 4, - "swag", 4, - "arc", 4 -} +MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4} diff --git a/examples/utils_ner.py b/examples/utils_ner.py index 45ddeafbd..d37583469 100644 --- a/examples/utils_ner.py +++ b/examples/utils_ner.py @@ -61,9 +61,7 @@ def read_examples_from_file(data_dir, mode): for line in f: if line.startswith("-DOCSTART-") or line == "" or line == "\n": if words: - examples.append(InputExample(guid="{}-{}".format(mode, guid_index), - words=words, - labels=labels)) + examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels)) guid_index += 1 words = [] labels = [] @@ -76,27 +74,27 @@ def read_examples_from_file(data_dir, mode): # Examples could have no label for mode = "test" labels.append("O") if words: - examples.append(InputExample(guid="%s-%d".format(mode, guid_index), - words=words, - labels=labels)) + examples.append(InputExample(guid="%s-%d".format(mode, guid_index), words=words, labels=labels)) return examples -def convert_examples_to_features(examples, - label_list, - max_seq_length, - tokenizer, - cls_token_at_end=False, - cls_token="[CLS]", - cls_token_segment_id=1, - sep_token="[SEP]", - sep_token_extra=False, - pad_on_left=False, - pad_token=0, - pad_token_segment_id=0, - pad_token_label_id=-100, - sequence_a_segment_id=0, - mask_padding_with_zero=True): +def convert_examples_to_features( + examples, + label_list, + max_seq_length, + tokenizer, + cls_token_at_end=False, + cls_token="[CLS]", + cls_token_segment_id=1, + sep_token="[SEP]", + sep_token_extra=False, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + pad_token_label_id=-100, + sequence_a_segment_id=0, + mask_padding_with_zero=True, +): """ Loads a data file into a list of `InputBatch`s `cls_token_at_end` define the location of the CLS token: - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] @@ -122,8 +120,8 @@ def convert_examples_to_features(examples, # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. 
special_tokens_count = 3 if sep_token_extra else 2 if len(tokens) > max_seq_length - special_tokens_count: - tokens = tokens[:(max_seq_length - special_tokens_count)] - label_ids = label_ids[:(max_seq_length - special_tokens_count)] + tokens = tokens[: (max_seq_length - special_tokens_count)] + label_ids = label_ids[: (max_seq_length - special_tokens_count)] # The convention in BERT is: # (a) For sequence pairs: @@ -174,10 +172,10 @@ def convert_examples_to_features(examples, segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids label_ids = ([pad_token_label_id] * padding_length) + label_ids else: - input_ids += ([pad_token] * padding_length) - input_mask += ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids += ([pad_token_segment_id] * padding_length) - label_ids += ([pad_token_label_id] * padding_length) + input_ids += [pad_token] * padding_length + input_mask += [0 if mask_padding_with_zero else 1] * padding_length + segment_ids += [pad_token_segment_id] * padding_length + label_ids += [pad_token_label_id] * padding_length assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length @@ -194,10 +192,8 @@ def convert_examples_to_features(examples, logger.info("label_ids: %s", " ".join([str(x) for x in label_ids])) features.append( - InputFeatures(input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_ids=label_ids)) + InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids) + ) return features @@ -209,4 +205,4 @@ def get_labels(path): labels = ["O"] + labels return labels else: - return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"] + return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"] diff --git a/hubconf.py b/hubconf.py index 3fa354ed5..1d100271a 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,9 +1,15 @@ from transformers import ( - AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering + AutoTokenizer, + AutoConfig, + AutoModel, + AutoModelWithLMHead, + AutoModelForSequenceClassification, + AutoModelForQuestionAnswering, ) from transformers.file_utils import add_start_docstrings -dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses'] +dependencies = ["torch", "tqdm", "boto3", "requests", "regex", "sentencepiece", "sacremoses"] + @add_start_docstrings(AutoConfig.__doc__) def config(*args, **kwargs): @@ -57,6 +63,7 @@ def model(*args, **kwargs): return AutoModel.from_pretrained(*args, **kwargs) + @add_start_docstrings(AutoModelWithLMHead.__doc__) def modelWithLMHead(*args, **kwargs): r""" diff --git a/setup.py b/setup.py index fe2e1526b..59dbfef12 100644 --- a/setup.py +++ b/setup.py @@ -38,11 +38,11 @@ from setuptools import find_packages, setup extras = { - 'serving': ['pydantic', 'uvicorn', 'fastapi'], - 'serving-tf': ['pydantic', 'uvicorn', 'fastapi', 'tensorflow'], - 'serving-torch': ['pydantic', 'uvicorn', 'fastapi', 'torch'] + "serving": ["pydantic", "uvicorn", "fastapi"], + "serving-tf": ["pydantic", "uvicorn", "fastapi", "tensorflow"], + "serving-torch": ["pydantic", "uvicorn", "fastapi", "torch"], } -extras['all'] = [package for package in extras.values()] +extras["all"] = [package for package in extras.values()] setup( name="transformers", @@ -50,30 +50,29 @@ setup( author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team 
Authors, Facebook AI Authors, Carnegie Mellon University Authors", author_email="thomas@huggingface.co", description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch", - long_description=open("README.md", "r", encoding='utf-8').read(), + long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", - keywords='NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU', - license='Apache', + keywords="NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU", + license="Apache", url="https://github.com/huggingface/transformers", - packages=find_packages(exclude=["*.tests", "*.tests.*", - "tests.*", "tests"]), - install_requires=['numpy', - 'boto3', - 'filelock', - 'requests', - 'tqdm', - 'regex != 2019.12.17', - 'sentencepiece', - 'sacremoses'], - extras_require=extras, - scripts=[ - 'transformers-cli' + packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), + install_requires=[ + "numpy", + "boto3", + "filelock", + "requests", + "tqdm", + "regex != 2019.12.17", + "sentencepiece", + "sacremoses", ], + extras_require=extras, + scripts=["transformers-cli"], # python_requires='>=3.5.0', classifiers=[ - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python :: 3', - 'Topic :: Scientific/Engineering :: Artificial Intelligence', + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering :: Artificial Intelligence", ], ) diff --git a/templates/adding_a_new_example_script/run_xxx.py b/templates/adding_a_new_example_script/run_xxx.py index 77ce587a5..64e92f2a2 100644 --- a/templates/adding_a_new_example_script/run_xxx.py +++ b/templates/adding_a_new_example_script/run_xxx.py @@ -24,8 +24,7 @@ import glob import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler try: @@ -35,19 +34,32 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, BertConfig, - BertForQuestionAnswering, BertTokenizer, - XLMConfig, XLMForQuestionAnswering, - XLMTokenizer, XLNetConfig, - XLNetForQuestionAnswering, - XLNetTokenizer, - DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForQuestionAnswering, + BertTokenizer, + XLMConfig, + XLMForQuestionAnswering, + XLMTokenizer, + XLNetConfig, + XLNetForQuestionAnswering, + XLNetTokenizer, + DistilBertConfig, + DistilBertForQuestionAnswering, + DistilBertTokenizer, +) from transformers import AdamW, get_linear_schedule_with_warmup -from utils_squad import (read_squad_examples, convert_examples_to_features, - RawResult, write_predictions, - RawResultExtended, write_predictions_extended) +from utils_squad import ( + read_squad_examples, + convert_examples_to_features, + RawResult, + write_predictions, + RawResultExtended, + write_predictions_extended, +) # The follwing import is the official SQuAD evaluation script (2.0). 
# You can remove it from the dependencies if you are using this script outside of the library @@ -56,16 +68,18 @@ from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \ - for conf in (BertConfig, XLNetConfig, XLMConfig)), ()) +ALL_MODELS = sum( + (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), () +) MODEL_CLASSES = { - 'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer), - 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), - 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) + "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer), + "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), + "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), + "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), } + def set_seed(args): random.seed(args.seed) np.random.seed(args.seed) @@ -73,9 +87,11 @@ def set_seed(args): if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) + def to_list(tensor): return tensor.detach().cpu().tolist() + def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: @@ -92,13 +108,18 @@ def train(args, train_dataset, model, tokenizer): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) if args.fp16: try: from apex import amp @@ -112,17 +133,21 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -136,20 +161,21 @@ def train(args, train_dataset, model, tokenizer): for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'start_positions': batch[3], - 'end_positions': batch[4]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] - if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[5], - 'p_mask': batch[6]}) + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "start_positions": batch[3], + "end_positions": batch[4], + } + if args.model_type != "distilbert": + inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2] + if args.model_type in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training + loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -173,22 +199,26 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + tb_writer.add_scalar("eval_{}".format(key), value, global_step) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, 
"training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -224,32 +254,31 @@ def evaluate(args, model, tokenizer, prefix=""): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1] - } - if args.model_type != 'distilbert': - inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids + inputs = {"input_ids": batch[0], "attention_mask": batch[1]} + if args.model_type != "distilbert": + inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2] # XLM don't use segment_ids example_indices = batch[3] - if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[4], - 'p_mask': batch[5]}) + if args.model_type in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) - if args.model_type in ['xlnet', 'xlm']: + if args.model_type in ["xlnet", "xlm"]: # XLNet uses a more complex post-processing procedure - result = RawResultExtended(unique_id = unique_id, - start_top_log_probs = to_list(outputs[0][i]), - start_top_index = to_list(outputs[1][i]), - end_top_log_probs = to_list(outputs[2][i]), - end_top_index = to_list(outputs[3][i]), - cls_logits = to_list(outputs[4][i])) + result = RawResultExtended( + unique_id=unique_id, + start_top_log_probs=to_list(outputs[0][i]), + start_top_index=to_list(outputs[1][i]), + end_top_log_probs=to_list(outputs[2][i]), + end_top_index=to_list(outputs[3][i]), + cls_logits=to_list(outputs[4][i]), + ) else: - result = RawResult(unique_id = unique_id, - start_logits = to_list(outputs[0][i]), - end_logits = to_list(outputs[1][i])) + result = RawResult( + unique_id=unique_id, start_logits=to_list(outputs[0][i]), end_logits=to_list(outputs[1][i]) + ) all_results.append(result) # Compute predictions @@ -260,23 +289,44 @@ def evaluate(args, model, tokenizer, prefix=""): else: output_null_log_odds_file = None - if args.model_type in ['xlnet', 'xlm']: + if args.model_type in ["xlnet", "xlm"]: # XLNet uses a more complex post-processing procedure - write_predictions_extended(examples, features, all_results, args.n_best_size, - args.max_answer_length, output_prediction_file, - output_nbest_file, output_null_log_odds_file, args.predict_file, - model.config.start_n_top, model.config.end_n_top, - args.version_2_with_negative, tokenizer, args.verbose_logging) + write_predictions_extended( + examples, + features, + all_results, + args.n_best_size, + args.max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + args.predict_file, + model.config.start_n_top, + model.config.end_n_top, + args.version_2_with_negative, + tokenizer, + args.verbose_logging, + ) else: - write_predictions(examples, features, all_results, args.n_best_size, - args.max_answer_length, args.do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, args.verbose_logging, - args.version_2_with_negative, args.null_score_diff_threshold) + write_predictions( + examples, + features, + all_results, + args.n_best_size, + args.max_answer_length, + args.do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + args.verbose_logging, + args.version_2_with_negative, + args.null_score_diff_threshold, + 
) # Evaluate with the official SQuAD script - evaluate_options = EVAL_OPTS(data_file=args.predict_file, - pred_file=output_prediction_file, - na_prob_file=output_null_log_odds_file) + evaluate_options = EVAL_OPTS( + data_file=args.predict_file, pred_file=output_prediction_file, na_prob_file=output_null_log_odds_file + ) results = evaluate_on_squad(evaluate_options) return results @@ -287,24 +337,30 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal # Load data features from cache or dataset file input_file = args.predict_file if evaluate else args.train_file - cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length))) + cached_features_file = os.path.join( + os.path.dirname(input_file), + "cached_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + ), + ) if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", input_file) - examples = read_squad_examples(input_file=input_file, - is_training=not evaluate, - version_2_with_negative=args.version_2_with_negative) - features = convert_examples_to_features(examples=examples, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - doc_stride=args.doc_stride, - max_query_length=args.max_query_length, - is_training=not evaluate) + examples = read_squad_examples( + input_file=input_file, is_training=not evaluate, version_2_with_negative=args.version_2_with_negative + ) + features = convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) @@ -320,14 +376,21 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) if evaluate: all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_example_index, all_cls_index, all_p_mask) + dataset = TensorDataset( + all_input_ids, all_input_mask, all_segment_ids, all_example_index, all_cls_index, all_p_mask + ) else: all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_start_positions, all_end_positions, - all_cls_index, all_p_mask) + dataset = TensorDataset( + all_input_ids, + all_input_mask, + all_segment_ids, + all_start_positions, + all_end_positions, + all_cls_index, + all_p_mask, + ) if output_examples: return dataset, examples, features @@ -338,109 +401,190 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--train_file", default=None, type=str, required=True, - help="SQuAD json for training. 
E.g., train-v1.1.json") - parser.add_argument("--predict_file", default=None, type=str, required=True, - help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model checkpoints and predictions will be written.") + parser.add_argument( + "--train_file", default=None, type=str, required=True, help="SQuAD json for training. E.g., train-v1.1.json" + ) + parser.add_argument( + "--predict_file", + default=None, + type=str, + required=True, + help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints and predictions will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - - parser.add_argument('--version_2_with_negative', action='store_true', - help='If true, the SQuAD examples contain some that do not have an answer.') - parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, - help="If null_score - best_non_null is greater than the threshold predict null.") - - parser.add_argument("--max_seq_length", default=384, type=int, - help="The maximum total input sequence length after WordPiece tokenization. Sequences " - "longer than this will be truncated, and sequences shorter than this will be padded.") - parser.add_argument("--doc_stride", default=128, type=int, - help="When splitting up a long document into chunks, how much stride to take between chunks.") - parser.add_argument("--max_query_length", default=64, type=int, - help="The maximum number of tokens for the question. 
Questions longer than this will " - "be truncated to this length.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") - - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - parser.add_argument("--n_best_size", default=20, type=int, - help="The total number of n-best predictions to generate in the nbest_predictions.json output file.") - parser.add_argument("--max_answer_length", default=30, type=int, - help="The maximum length of an answer that can be generated. This is needed because the start " - "and end predictions are not conditioned on one another.") - parser.add_argument("--verbose_logging", action='store_true', - help="If true, all of the warnings related to data processing will be printed. 
" - "A number of warnings are expected for a normal SQuAD evaluation.") - - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Whether not to use CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") - - parser.add_argument("--local_rank", type=int, default=-1, - help="local_rank for distributed training on gpus") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + + parser.add_argument( + "--version_2_with_negative", + action="store_true", + help="If true, the SQuAD examples contain some that do not have an answer.", + ) + parser.add_argument( + "--null_score_diff_threshold", + type=float, + default=0.0, + help="If null_score - best_non_null is greater than the threshold predict null.", + ) + + parser.add_argument( + "--max_seq_length", + default=384, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. Sequences " + "longer than this will be truncated, and sequences shorter than this will be padded.", + ) + parser.add_argument( + "--doc_stride", + default=128, + type=int, + help="When splitting up a long document into chunks, how much stride to take between chunks.", + ) + parser.add_argument( + "--max_query_length", + default=64, + type=int, + help="The maximum number of tokens for the question. Questions longer than this will " + "be truncated to this length.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." 
+ ) + + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + parser.add_argument( + "--n_best_size", + default=20, + type=int, + help="The total number of n-best predictions to generate in the nbest_predictions.json output file.", + ) + parser.add_argument( + "--max_answer_length", + default=30, + type=int, + help="The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another.", + ) + parser.add_argument( + "--verbose_logging", + action="store_true", + help="If true, all of the warnings related to data processing will be printed. " + "A number of warnings are expected for a normal SQuAD evaluation.", + ) + + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") + + parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
+ "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -452,16 +596,24 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -472,15 +624,21 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" 
in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -495,7 +653,8 @@ def main(): if args.fp16: try: import apex - apex.amp.register_half_function(torch, 'einsum') + + apex.amp.register_half_function(torch, "einsum") except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") @@ -505,7 +664,6 @@ def main(): global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Save the trained model and the tokenizer if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed @@ -515,39 +673,42 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.to(args.device) - # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory results = {} if args.do_eval and args.local_rank in [-1, 0]: checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: # Reload the model - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) model.to(args.device) # Evaluate result = evaluate(args, model, tokenizer, prefix=global_step) - result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items()) + result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items()) results.update(result) logger.info("Results: {}".format(results)) diff --git a/templates/adding_a_new_example_script/utils_xxx.py b/templates/adding_a_new_example_script/utils_xxx.py index 3f4145e02..bd016bd30 100644 --- a/templates/adding_a_new_example_script/utils_xxx.py +++ b/templates/adding_a_new_example_script/utils_xxx.py @@ -1,4 +1,3 @@ - # coding=utf-8 # Copyright 2018 XXX. All rights reserved. 
# @@ -37,14 +36,16 @@ class SquadExample(object): For examples without an answer, the start and end position are -1. """ - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=None): + def __init__( + self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=None, + ): self.qas_id = qas_id self.question_text = question_text self.doc_tokens = doc_tokens @@ -59,8 +60,7 @@ class SquadExample(object): def __repr__(self): s = "" s += "qas_id: %s" % (self.qas_id) - s += ", question_text: %s" % ( - self.question_text) + s += ", question_text: %s" % (self.question_text) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) @@ -74,22 +74,24 @@ class SquadExample(object): class InputFeatures(object): """A single set of features of data.""" - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - cls_index, - p_mask, - paragraph_len, - start_position=None, - end_position=None, - is_impossible=None): + def __init__( + self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + cls_index, + p_mask, + paragraph_len, + start_position=None, + end_position=None, + is_impossible=None, + ): self.unique_id = unique_id self.example_index = example_index self.doc_span_index = doc_span_index @@ -109,7 +111,7 @@ class InputFeatures(object): def read_squad_examples(input_file, is_training, version_2_with_negative): """Read a SQuAD json file into a list of SquadExample.""" - with open(input_file, "r", encoding='utf-8') as reader: + with open(input_file, "r", encoding="utf-8") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): @@ -146,8 +148,7 @@ def read_squad_examples(input_file, is_training, version_2_with_negative): if version_2_with_negative: is_impossible = qa["is_impossible"] if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each question should have exactly 1 answer.") + raise ValueError("For training, each question should have exactly 1 answer.") if not is_impossible: answer = qa["answers"][0] orig_answer_text = answer["text"] @@ -161,12 +162,10 @@ def read_squad_examples(input_file, is_training, version_2_with_negative): # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. - actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) - cleaned_answer_text = " ".join( - whitespace_tokenize(orig_answer_text)) + actual_text = " ".join(doc_tokens[start_position : (end_position + 1)]) + cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: - logger.warning("Could not find answer: '%s' vs. '%s'", - actual_text, cleaned_answer_text) + logger.warning("Could not find answer: '%s' vs. 
'%s'", actual_text, cleaned_answer_text) continue else: start_position = -1 @@ -180,18 +179,29 @@ def read_squad_examples(input_file, is_training, version_2_with_negative): orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, - is_impossible=is_impossible) + is_impossible=is_impossible, + ) examples.append(example) return examples -def convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training, - cls_token_at_end=False, - cls_token='[CLS]', sep_token='[SEP]', pad_token=0, - sequence_a_segment_id=0, sequence_b_segment_id=1, - cls_token_segment_id=0, pad_token_segment_id=0, - mask_padding_with_zero=True): +def convert_examples_to_features( + examples, + tokenizer, + max_seq_length, + doc_stride, + max_query_length, + is_training, + cls_token_at_end=False, + cls_token="[CLS]", + sep_token="[SEP]", + pad_token=0, + sequence_a_segment_id=0, + sequence_b_segment_id=1, + cls_token_segment_id=0, + pad_token_segment_id=0, + mask_padding_with_zero=True, +): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 @@ -232,8 +242,8 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.orig_answer_text + ) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 @@ -241,8 +251,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, # We can have documents that are longer than the maximum sequence length. # To deal with this we do a sliding window approach, where we take chunks # of the up to our max length with a stride of `doc_stride`. 
- _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) + _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) # pylint: disable=invalid-name doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): @@ -287,8 +296,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, split_token_index = doc_span.start + i token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) + is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) token_is_max_context[len(tokens)] = is_max_context tokens.append(all_doc_tokens[split_token_index]) segment_ids.append(sequence_b_segment_id) @@ -333,8 +341,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_start = doc_span.start doc_end = doc_span.start + doc_span.length - 1 out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): + if not (tok_start_position >= doc_start and tok_end_position <= doc_end): out_of_span = True if out_of_span: start_position = 0 @@ -355,24 +362,23 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, logger.info("example_index: %s" % (example_index)) logger.info("doc_span_index: %s" % (doc_span_index)) logger.info("tokens: %s" % " ".join(tokens)) - logger.info("token_to_orig_map: %s" % " ".join([ - "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) - logger.info("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() - ])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) logger.info( - "input_mask: %s" % " ".join([str(x) for x in input_mask])) + "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]) + ) logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + "token_is_max_context: %s" + % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]) + ) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) if is_training and span_is_impossible: logger.info("impossible example") if is_training and not span_is_impossible: - answer_text = " ".join(tokens[start_position:(end_position + 1)]) + answer_text = " ".join(tokens[start_position : (end_position + 1)]) logger.info("start_position: %d" % (start_position)) logger.info("end_position: %d" % (end_position)) - logger.info( - "answer: %s" % (answer_text)) + logger.info("answer: %s" % (answer_text)) features.append( InputFeatures( @@ -390,14 +396,15 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, paragraph_len=paragraph_len, start_position=start_position, end_position=end_position, - is_impossible=span_is_impossible)) + is_impossible=span_is_impossible, + ) + ) unique_id += 1 return features -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): """Returns tokenized answer spans that better match the annotated answer.""" # The SQuAD annotations are character based. 
We first project them to @@ -426,7 +433,7 @@ def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, for new_start in range(input_start, input_end + 1): for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) if text_span == tok_answer_text: return (new_start, new_end) @@ -470,13 +477,23 @@ def _check_is_max_context(doc_spans, cur_span_index, position): return cur_span_index == best_span_index -RawResult = collections.namedtuple("RawResult", - ["unique_id", "start_logits", "end_logits"]) +RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"]) -def write_predictions(all_examples, all_features, all_results, n_best_size, - max_answer_length, do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, verbose_logging, - version_2_with_negative, null_score_diff_threshold): + +def write_predictions( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + verbose_logging, + version_2_with_negative, + null_score_diff_threshold, +): """Write final predictions to the json file and log-odds of null if needed.""" logger.info("Writing predictions to: %s" % (output_prediction_file)) logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -490,8 +507,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, unique_id_to_result[result.unique_id] = result _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"] + ) all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() @@ -544,7 +561,9 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, start_index=start_index, end_index=end_index, start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) + end_logit=result.end_logits[end_index], + ) + ) if version_2_with_negative: prelim_predictions.append( _PrelimPrediction( @@ -552,14 +571,14 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, start_index=0, end_index=0, start_logit=null_start_logit, - end_logit=null_end_logit)) - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) + end_logit=null_end_logit, + ) + ) + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"]) + "NbestPrediction", ["text", "start_logit", "end_logit"] + ) seen_predictions = {} nbest = [] @@ -568,10 +587,10 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, break feature = features[pred.feature_index] if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + orig_tokens = 
example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = " ".join(tok_tokens) # De-tokenize WordPieces that have been split off. @@ -592,31 +611,21 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, final_text = "" seen_predictions[final_text] = True - nbest.append( - _NbestPrediction( - text=final_text, - start_logit=pred.start_logit, - end_logit=pred.end_logit)) + nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: - nbest.append( - _NbestPrediction( - text="", - start_logit=null_start_logit, - end_logit=null_end_logit)) - + nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) + # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. - if len(nbest)==1: - nbest.insert(0, - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + if len(nbest) == 1: + nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) assert len(nbest) >= 1 @@ -645,8 +654,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, all_predictions[example.qas_id] = nbest_json[0]["text"] else: # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - ( - best_non_null_entry.end_logit) + score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) scores_diff_json[example.qas_id] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example.qas_id] = "" @@ -668,29 +676,40 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, # For XLNet (and XLM which uses the same head) -RawResultExtended = collections.namedtuple("RawResultExtended", - ["unique_id", "start_top_log_probs", "start_top_index", - "end_top_log_probs", "end_top_index", "cls_logits"]) - - -def write_predictions_extended(all_examples, all_features, all_results, n_best_size, - max_answer_length, output_prediction_file, - output_nbest_file, - output_null_log_odds_file, orig_data_file, - start_n_top, end_n_top, version_2_with_negative, - tokenizer, verbose_logging): +RawResultExtended = collections.namedtuple( + "RawResultExtended", + ["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits"], +) + + +def write_predictions_extended( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + orig_data_file, + start_n_top, + end_n_top, + version_2_with_negative, + tokenizer, + verbose_logging, +): """ XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of null if needed. 
Requires utils_squad_evaluate.py """ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", - "start_log_prob", "end_log_prob"]) + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"] + ) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] + ) logger.info("Writing predictions to: %s", output_prediction_file) # logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -754,12 +773,13 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s start_index=start_index, end_index=end_index, start_log_prob=start_log_prob, - end_log_prob=end_log_prob)) + end_log_prob=end_log_prob, + ) + ) prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_log_prob + x.end_log_prob), - reverse=True) + prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True + ) seen_predictions = {} nbest = [] @@ -770,7 +790,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s # XLNet un-tokenizer # Let's keep it simple for now and see if we need all this later. - # + # # tok_start_to_orig_index = feature.tok_start_to_orig_index # tok_end_to_orig_index = feature.tok_end_to_orig_index # start_orig_pos = tok_start_to_orig_index[pred.start_index] @@ -779,10 +799,10 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() # Previously used Bert untokenizer - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = tokenizer.convert_tokens_to_string(tok_tokens) # Clean whitespace @@ -790,8 +810,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, - verbose_logging) + final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging) if final_text in seen_predictions: continue @@ -799,17 +818,13 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s seen_predictions[final_text] = True nbest.append( - _NbestPrediction( - text=final_text, - start_log_prob=pred.start_log_prob, - end_log_prob=pred.end_log_prob)) + _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob) + ) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. 
if not nbest: - nbest.append( - _NbestPrediction(text="", start_log_prob=-1e6, - end_log_prob=-1e6)) + nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6)) total_scores = [] best_non_null_entry = None @@ -850,7 +865,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s with open(output_null_log_odds_file, "w") as writer: writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - with open(orig_data_file, "r", encoding='utf-8') as reader: + with open(orig_data_file, "r", encoding="utf-8") as reader: orig_data = json.load(reader)["data"] qid_to_has_ans = make_qid_to_has_ans(orig_data) @@ -914,8 +929,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): start_position = tok_text.find(pred_text) if start_position == -1: if verbose_logging: - logger.info( - "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) return orig_text end_position = start_position + len(pred_text) - 1 @@ -924,8 +938,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: - logger.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text) return orig_text # We then project the characters in `pred_text` back to `orig_text` using @@ -956,7 +969,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): logger.info("Couldn't map end position") return orig_text - output_text = orig_text[orig_start_position:(orig_end_position + 1)] + output_text = orig_text[orig_start_position : (orig_end_position + 1)] return output_text diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py index 12d69799a..370fbb569 100644 --- a/templates/adding_a_new_model/configuration_xxx.py +++ b/templates/adding_a_new_model/configuration_xxx.py @@ -27,8 +27,8 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) XXX_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-config.json", - 'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-config.json", + "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-config.json", + "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-config.json", } @@ -63,24 +63,26 @@ class XxxConfig(PretrainedConfig): """ pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=50257, - n_positions=1024, - n_ctx=1024, - n_embd=768, - n_layer=12, - n_head=12, - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - summary_type='cls_index', - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - **kwargs): + def __init__( + self, + vocab_size=50257, + n_positions=1024, + n_ctx=1024, + n_embd=768, + n_layer=12, + n_head=12, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + summary_type="cls_index", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + **kwargs + ): 
super(XxxConfig, self).__init__(**kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx diff --git a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py index 9d389deaa..99d376149 100755 --- a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py +++ b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py @@ -24,8 +24,10 @@ import torch from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx import logging + logging.basicConfig(level=logging.INFO) + def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): # Initialise PyTorch model config = XxxConfig.from_json_file(config_file) @@ -43,23 +45,19 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--tf_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--config_file", - default = None, - type = str, - required = True, - help = "The config json file corresponding to the pre-trained model. \n" - "This specifies the model architecture.") - parser.add_argument("--pytorch_dump_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the pre-trained model. \n" + "This specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
+ ) args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.config_file, - args.pytorch_dump_path) + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py index 178362099..a4477704a 100644 --- a/templates/adding_a_new_model/modeling_tf_xxx.py +++ b/templates/adding_a_new_model/modeling_tf_xxx.py @@ -44,8 +44,8 @@ logger = logging.getLogger(__name__) # for the pretrained weights provided with the models #################################################### TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-tf_model.h5", - 'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-tf_model.h5", + "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-tf_model.h5", + "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-tf_model.h5", } #################################################### @@ -69,9 +69,9 @@ TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = { class TFXxxLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFXxxLayer, self).__init__(**kwargs) - self.attention = TFXxxAttention(config, name='attention') - self.intermediate = TFXxxIntermediate(config, name='intermediate') - self.transformer_output = TFXxxOutput(config, name='output') + self.attention = TFXxxAttention(config, name="attention") + self.intermediate = TFXxxIntermediate(config, name="intermediate") + self.transformer_output = TFXxxOutput(config, name="output") def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs @@ -98,7 +98,9 @@ class TFXxxMainLayer(tf.keras.layers.Layer): def _prune_heads(self, heads_to_prune): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): + def call( + self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False + ): # We allow three types of multi-inputs: # - traditional keyword arguments in the call method # - all the arguments provided as a dict in the first positional argument of call @@ -113,11 +115,11 @@ class TFXxxMainLayer(tf.keras.layers.Layer): head_mask = inputs[4] if len(inputs) > 4 else head_mask assert len(inputs) <= 5, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) assert len(inputs) <= 5, "Too many inputs." else: input_ids = inputs @@ -175,6 +177,7 @@ class TFXxxPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. 
""" + config_class = XxxConfig pretrained_model_archive_map = TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -263,8 +266,12 @@ XXX_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.", - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.", + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class TFXxxModel(TFXxxPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -297,17 +304,19 @@ class TFXxxModel(TFXxxPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFXxxModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXxxMainLayer(config, name='transformer') + self.transformer = TFXxxMainLayer(config, name="transformer") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) return outputs -@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) +@add_start_docstrings( + """Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING +) class TFXxxForMaskedLM(TFXxxPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -333,26 +342,30 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel): prediction_scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFXxxForMaskedLM, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXxxMainLayer(config, name='transformer') - self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name='mlm') + self.transformer = TFXxxMainLayer(config, name="transformer") + self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) sequence_output = outputs[0] - prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False)) + prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here return outputs # prediction_scores, (hidden_states), (attentions) -@add_start_docstrings("""Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class TFXxxForSequenceClassification(TFXxxPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -378,22 +391,23 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFXxxForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.transformer = TFXxxMainLayer(config, name='transformer') + self.transformer = TFXxxMainLayer(config, name="transformer") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False)) + pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here @@ -401,9 +415,12 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel): return outputs # logits, (hidden_states), (attentions) -@add_start_docstrings("""Xxx Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """Xxx Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class TFXxxForTokenClassification(TFXxxPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -429,22 +446,23 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel): scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFXxxForTokenClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.transformer = TFXxxMainLayer(config, name='transformer') + self.transformer = TFXxxMainLayer(config, name="transformer") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False)) + sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here @@ -452,9 +470,12 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel): return outputs # scores, (hidden_states), (attentions) -@add_start_docstrings("""Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class TFXxxForQuestionAnswering(TFXxxPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -482,14 +503,15 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel): start_scores, end_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFXxxForQuestionAnswering, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.transformer = TFXxxMainLayer(config, name='transformer') - self.qa_outputs = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='qa_outputs') + self.transformer = TFXxxMainLayer(config, name="transformer") + self.qa_outputs = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py index 4c325196e..7270376ec 100644 --- a/templates/adding_a_new_model/modeling_xxx.py +++ b/templates/adding_a_new_model/modeling_xxx.py @@ -44,8 +44,8 @@ logger = logging.getLogger(__name__) # for the pretrained weights provided with the models #################################################### XXX_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-pytorch_model.bin", - 'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-pytorch_model.bin", + "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-pytorch_model.bin", + "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-pytorch_model.bin", } #################################################### @@ -60,8 +60,10 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path): import numpy as np import tensorflow as tf except ImportError: - logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) @@ -76,7 +78,7 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path): arrays.append(array) for name, array in zip(names, arrays): - name = name.split('/') + name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any(n in ["adam_v", "adam_m", "global_step"] for n in name): @@ -84,18 +86,18 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path): continue pointer = model for m_name in name: - if re.fullmatch(r'[A-Za-z]+_\d+', m_name): - l = re.split(r'_(\d+)', m_name) + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + l = re.split(r"_(\d+)", m_name) else: l = [m_name] - if l[0] == 'kernel' or l[0] == 'gamma': - pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias' or l[0] == 'beta': - pointer = getattr(pointer, 'bias') - elif l[0] == 'output_weights': - pointer = getattr(pointer, 'weight') - elif l[0] == 'squad': - pointer = getattr(pointer, 'classifier') + if l[0] == "kernel" or l[0] == "gamma": + pointer = getattr(pointer, "weight") + elif l[0] == "output_bias" or l[0] == "beta": + pointer = getattr(pointer, "bias") + elif l[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif l[0] == "squad": + pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, l[0]) @@ -105,9 +107,9 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path): if len(l) >= 2: num = int(l[1]) pointer = pointer[num] - if m_name[-11:] == '_embeddings': - pointer = getattr(pointer, 'weight') - elif m_name == 'kernel': + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": array = np.transpose(array) try: assert pointer.shape == array.shape @@ -147,7 +149,6 @@ class XxxLayer(nn.Module): return outputs - #################################################### # PreTrainedModel is a sub-class of torch.nn.Module # which take care of loading and saving pretrained weights @@ -161,6 +162,7 @@ class XxxPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = XxxConfig pretrained_model_archive_map = XXX_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_xxx @@ -246,8 +248,12 @@ XXX_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. 
""" -@add_start_docstrings("The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.", - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.", + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class XxxModel(XxxPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -277,6 +283,7 @@ class XxxModel(XxxPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(XxxModel, self).__init__(config) @@ -300,7 +307,15 @@ class XxxModel(XxxPreTrainedModel): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -329,7 +344,7 @@ class XxxModel(XxxPreTrainedModel): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 # Prepare head mask if needed @@ -342,14 +357,20 @@ class XxxModel(XxxPreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.num_hidden_layers ################################## # Replace this with your model code - embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds) + embedding_output = self.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask) sequence_output = encoder_outputs[0] outputs = (sequence_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here @@ -357,8 +378,9 @@ class XxxModel(XxxPreTrainedModel): return outputs # sequence_output, (hidden_states), (attentions) -@add_start_docstrings("""Xxx Model with a `language modeling` head on top. 
""", - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) +@add_start_docstrings( + """Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING +) class XxxForMaskedLM(XxxPreTrainedModel): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -389,6 +411,7 @@ class XxxForMaskedLM(XxxPreTrainedModel): loss, prediction_scores = outputs[:2] """ + def __init__(self, config): super(XxxForMaskedLM, self).__init__(config) @@ -400,15 +423,25 @@ class XxxForMaskedLM(XxxPreTrainedModel): def get_output_embeddings(self): return self.lm_head - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None): - - outputs = self.transformer(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + masked_lm_labels=None, + ): + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) @@ -422,9 +455,12 @@ class XxxForMaskedLM(XxxPreTrainedModel): return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) -@add_start_docstrings("""Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class XxxForSequenceClassification(XxxPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -456,6 +492,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(XxxForSequenceClassification, self).__init__(config) self.num_labels = config.num_labels @@ -466,15 +503,25 @@ class XxxForSequenceClassification(XxxPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, - position_ids=None, head_mask=None, inputs_embeds=None, labels=None): - - outputs = self.transformer(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) pooled_output = outputs[1] @@ -496,9 +543,12 @@ class XxxForSequenceClassification(XxxPreTrainedModel): return outputs # (loss), logits, (hidden_states), (attentions) -@add_start_docstrings("""Xxx Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """Xxx Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. 
for Named-Entity-Recognition (NER) tasks. """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class XxxForTokenClassification(XxxPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -528,6 +578,7 @@ class XxxForTokenClassification(XxxPreTrainedModel): loss, scores = outputs[:2] """ + def __init__(self, config): super(XxxForTokenClassification, self).__init__(config) self.num_labels = config.num_labels @@ -538,15 +589,25 @@ class XxxForTokenClassification(XxxPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, - position_ids=None, head_mask=None, inputs_embeds=None, labels=None): - - outputs = self.transformer(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] @@ -569,9 +630,12 @@ class XxxForTokenClassification(XxxPreTrainedModel): return outputs # (loss), scores, (hidden_states), (attentions) -@add_start_docstrings("""Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class XxxForQuestionAnswering(XxxPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -613,6 +677,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel): """ + def __init__(self, config): super(XxxForQuestionAnswering, self).__init__(config) self.num_labels = config.num_labels @@ -622,15 +687,26 @@ class XxxForQuestionAnswering(XxxPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - start_positions=None, end_positions=None): - - outputs = self.transformer(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + ): + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] diff --git a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py index 6eba932a8..1e4f64042 100644 --- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py +++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import sys -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,46 +27,57 @@ from transformers import XxxConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_xxx import (TFXxxModel, TFXxxForMaskedLM, - TFXxxForSequenceClassification, - TFXxxForTokenClassification, - TFXxxForQuestionAnswering, - TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_xxx import ( + TFXxxModel, + TFXxxForMaskedLM, + TFXxxForSequenceClassification, + TFXxxForTokenClassification, + TFXxxForQuestionAnswering, + TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP, + ) @require_tf class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering, - TFXxxForSequenceClassification, - TFXxxForTokenClassification) if is_tf_available() else () + all_model_classes = ( + ( + TFXxxModel, + TFXxxForMaskedLM, + TFXxxForQuestionAnswering, + TFXxxForSequenceClassification, + TFXxxForTokenClassification, + ) + if is_tf_available() + else () + ) class TFXxxModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + 
use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -120,15 +131,16 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester): attention_probs_dropout_prob=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFXxxModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} sequence_output, pooled_output = model(inputs) inputs = [input_ids, input_mask] @@ -141,78 +153,74 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester): "pooled_output": pooled_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) - - def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFXxxForMaskedLM(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - prediction_scores, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (prediction_scores,) = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) - + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = TFXxxForSequenceClassification(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - logits, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (logits,) = model(inputs) result = { "logits": 
logits.numpy(), } - self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.num_labels]) - + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) - def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = TFXxxForTokenClassification(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - logits, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (logits,) = model(inputs) result = { "logits": logits.numpy(), } self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.seq_length, self.num_labels]) + list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels] + ) - - def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFXxxForQuestionAnswering(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} start_logits, end_logits = model(inputs) result = { "start_logits": start_logits.numpy(), "end_logits": end_logits.numpy(), } - self.parent.assertListEqual( - list(result["start_logits"].shape), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].shape), - [self.batch_size, self.seq_length]) - + self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -244,9 +252,10 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - for model_name in ['xxx-base-uncased']: + for model_name in ["xxx-base-uncased"]: model = TFXxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git a/templates/adding_a_new_model/tests/modeling_xxx_test.py b/templates/adding_a_new_model/tests/modeling_xxx_test.py index 5e22392d0..2043d7965 100644 --- a/templates/adding_a_new_model/tests/modeling_xxx_test.py +++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py @@ -20,51 +20,60 @@ import unittest from transformers import is_torch_available -from 
.modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device if is_torch_available(): - from transformers import (XxxConfig, XxxModel, XxxForMaskedLM, - XxxForNextSentencePrediction, XxxForPreTraining, - XxxForQuestionAnswering, XxxForSequenceClassification, - XxxForTokenClassification, XxxForMultipleChoice) + from transformers import ( + XxxConfig, + XxxModel, + XxxForMaskedLM, + XxxForNextSentencePrediction, + XxxForPreTraining, + XxxForQuestionAnswering, + XxxForSequenceClassification, + XxxForTokenClassification, + XxxForMultipleChoice, + ) from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP @require_torch class XxxModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering, - XxxForSequenceClassification, - XxxForTokenClassification) if is_torch_available() else () + all_model_classes = ( + (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering, XxxForSequenceClassification, XxxForTokenClassification) + if is_torch_available() + else () + ) class XxxModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -118,16 +127,17 @@ class XxxModelTest(CommonTestCases.CommonModelTester): attention_probs_dropout_prob=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) - def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = XxxModel(config=config) model.to(torch_device) model.eval() @@ -140,83 +150,98 @@ class XxxModelTest(CommonTestCases.CommonModelTester): "pooled_output": pooled_output, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), 
[self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) - - def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = XxxForMaskedLM(config=config) model.to(torch_device) model.eval() - loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels) + loss, prediction_scores = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels + ) result = { "loss": loss, "prediction_scores": prediction_scores, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.check_loss_output(result) - - def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = XxxForQuestionAnswering(config=config) model.to(torch_device) model.eval() - loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - start_positions=sequence_labels, end_positions=sequence_labels) + loss, start_logits, end_logits = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) result = { "loss": loss, "start_logits": start_logits, "end_logits": end_logits, } - self.parent.assertListEqual( - list(result["start_logits"].size()), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].size()), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length]) self.check_loss_output(result) - - def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = XxxForSequenceClassification(config) model.to(torch_device) model.eval() - loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + loss, logits = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels + ) result = { "loss": loss, "logits": logits, } - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels]) self.check_loss_output(result) - - def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def 
create_and_check_xxx_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = XxxForTokenClassification(config=config) model.to(torch_device) model.eval() - loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + loss, logits = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels + ) result = { "loss": loss, "logits": logits, } self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.seq_length, self.num_labels]) + list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels] + ) self.check_loss_output(result) - def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -252,5 +277,6 @@ class XxxModelTest(CommonTestCases.CommonModelTester): model = XxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git a/templates/adding_a_new_model/tests/tokenization_xxx_test.py b/templates/adding_a_new_model/tests/tokenization_xxx_test.py index 116083edc..940de5c76 100644 --- a/templates/adding_a_new_model/tests/tokenization_xxx_test.py +++ b/templates/adding_a_new_model/tests/tokenization_xxx_test.py @@ -18,10 +18,11 @@ import os import unittest from io import open -from transformers.tokenization_bert import (XxxTokenizer, VOCAB_FILES_NAMES) +from transformers.tokenization_bert import XxxTokenizer, VOCAB_FILES_NAMES from .tokenization_tests_commons import CommonTestCases + class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester): tokenizer_class = XxxTokenizer @@ -30,28 +31,39 @@ class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester): super(XxxTokenizationTest, self).setUp() vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", - "##ing", ",", "low", "lowest", + "[UNK]", + "[CLS]", + "[SEP]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_tokenizer(self, **kwargs): return XxxTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"UNwant\u00E9d,running" - output_text = u"unwanted, running" + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" return input_text, output_text def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") + tokens = tokenizer.tokenize("UNwant\u00E9d,running") 
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py index 7a10a41e5..c1ea93a6d 100644 --- a/templates/adding_a_new_model/tokenization_xxx.py +++ b/templates/adding_a_new_model/tokenization_xxx.py @@ -34,17 +34,16 @@ logger = logging.getLogger(__name__) # Mapping from the keyword arguments names of Tokenizer `__init__` # to file names for serializing Tokenizer instances #################################################### -VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to pretrained vocabulary URL for all the model shortcut names. #################################################### PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-vocab.txt", - 'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-vocab.txt", + "vocab_file": { + "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-vocab.txt", + "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-vocab.txt", } } @@ -52,8 +51,8 @@ PRETRAINED_VOCAB_FILES_MAP = { # Mapping from model shortcut names to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'xxx-base-uncased': 512, - 'xxx-large-uncased': 512, + "xxx-base-uncased": 512, + "xxx-large-uncased": 512, } #################################################### @@ -62,8 +61,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { # To be used for checkpoint specific configurations. #################################################### PRETRAINED_INIT_CONFIGURATION = { - 'xxx-base-uncased': {'do_lower_case': True}, - 'xxx-large-uncased': {'do_lower_case': True}, + "xxx-base-uncased": {"do_lower_case": True}, + "xxx-large-uncased": {"do_lower_case": True}, } @@ -73,7 +72,7 @@ def load_vocab(vocab_file): with open(vocab_file, "r", encoding="utf-8") as reader: tokens = reader.readlines() for index, token in enumerate(tokens): - token = token.rstrip('\n') + token = token.rstrip("\n") vocab[token] = index return vocab @@ -93,9 +92,17 @@ class XxxTokenizer(PreTrainedTokenizer): pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, do_lower_case=True, - unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", - mask_token="[MASK]", **kwargs): + def __init__( + self, + vocab_file, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): """Constructs a XxxTokenizer. 
Args: @@ -104,16 +111,22 @@ class XxxTokenizer(PreTrainedTokenizer): Whether to lower case the input Only has an effect when do_basic_tokenize=True """ - super(XxxTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, - pad_token=pad_token, cls_token=cls_token, - mask_token=mask_token, **kwargs) + super(XxxTokenizer, self).__init__( + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens if not os.path.isfile(vocab_file): raise ValueError( "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " - "model use `tokenizer = XxxTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + "model use `tokenizer = XxxTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) self.vocab = load_vocab(vocab_file) @property @@ -142,7 +155,7 @@ class XxxTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ - out_string = ' '.join(tokens).replace(' ##', '').strip() + out_string = " ".join(tokens).replace(" ##", "").strip() return out_string def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -177,8 +190,10 @@ class XxxTokenizer(PreTrainedTokenizer): if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: @@ -204,15 +219,17 @@ class XxxTokenizer(PreTrainedTokenizer): """Save the tokenizer vocabulary to a directory or file.""" index = 0 if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file']) + vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"]) else: vocab_file = vocab_path with open(vocab_file, "w", encoding="utf-8") as writer: for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): if index != token_index: - logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." - " Please check that the vocabulary is not corrupted!".format(vocab_file)) + logger.warning( + "Saving vocabulary to {}: vocabulary indices are not consecutive." 
+ " Please check that the vocabulary is not corrupted!".format(vocab_file) + ) index = token_index - writer.write(token + u'\n') + writer.write(token + "\n") index += 1 return (vocab_file,) diff --git a/transformers/__init__.py b/transformers/__init__.py index 017fe476e..318cd5ce4 100755 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -6,8 +6,9 @@ __version__ = "2.3.0" # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 try: import absl.logging - absl.logging.set_verbosity('info') - absl.logging.set_stderrthreshold('info') + + absl.logging.set_verbosity("info") + absl.logging.set_stderrthreshold("info") absl.logging._warn_preinit_stderr = False except: pass @@ -17,19 +18,41 @@ import logging logger = logging.getLogger(__name__) # pylint: disable=invalid-name # Files and general utilities -from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, - cached_path, add_start_docstrings, add_end_docstrings, - WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, MODEL_CARD_NAME, - is_tf_available, is_torch_available) - -from .data import (is_sklearn_available, - InputExample, InputFeatures, DataProcessor, - SingleSentenceClassificationProcessor, - glue_output_modes, glue_convert_examples_to_features, - glue_processors, glue_tasks_num_labels, - xnli_output_modes, xnli_processors, xnli_tasks_num_labels, - squad_convert_examples_to_features, SquadFeatures, - SquadExample, SquadV1Processor, SquadV2Processor) +from .file_utils import ( + TRANSFORMERS_CACHE, + PYTORCH_TRANSFORMERS_CACHE, + PYTORCH_PRETRAINED_BERT_CACHE, + cached_path, + add_start_docstrings, + add_end_docstrings, + WEIGHTS_NAME, + TF2_WEIGHTS_NAME, + TF_WEIGHTS_NAME, + CONFIG_NAME, + MODEL_CARD_NAME, + is_tf_available, + is_torch_available, +) + +from .data import ( + is_sklearn_available, + InputExample, + InputFeatures, + DataProcessor, + SingleSentenceClassificationProcessor, + glue_output_modes, + glue_convert_examples_to_features, + glue_processors, + glue_tasks_num_labels, + xnli_output_modes, + xnli_processors, + xnli_tasks_num_labels, + squad_convert_examples_to_features, + SquadFeatures, + SquadExample, + SquadV1Processor, + SquadV2Processor, +) if is_sklearn_available(): from .data import glue_compute_metrics, xnli_compute_metrics @@ -38,12 +61,12 @@ if is_sklearn_available(): from .modelcard import ModelCard # Tokenizers -from .tokenization_utils import (PreTrainedTokenizer) +from .tokenization_utils import PreTrainedTokenizer from .tokenization_auto import AutoTokenizer from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer from .tokenization_openai import OpenAIGPTTokenizer -from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) +from .tokenization_transfo_xl import TransfoXLTokenizer, TransfoXLCorpus from .tokenization_gpt2 import GPT2Tokenizer from .tokenization_ctrl import CTRLTokenizer from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE @@ -75,143 +98,281 @@ from .configuration_mmbt import MMBTConfig # Modeling if is_torch_available(): - from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) - from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, - AutoModelWithLMHead, AutoModelForTokenClassification, ALL_PRETRAINED_MODEL_ARCHIVE_MAP) - - from .modeling_bert import (BertPreTrainedModel, BertModel, 
BertForPreTraining, - BertForMaskedLM, BertForNextSentencePrediction, - BertForSequenceClassification, BertForMultipleChoice, - BertForTokenClassification, BertForQuestionAnswering, - load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, - OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, - load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, - AdaptiveEmbedding, - load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, - GPT2LMHeadModel, GPT2DoubleHeadsModel, - load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_ctrl import (CTRLPreTrainedModel, CTRLModel, - CTRLLMHeadModel, - CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, - XLNetForSequenceClassification, XLNetForTokenClassification, - XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple, - XLNetForQuestionAnswering, load_tf_weights_in_xlnet, - XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_xlm import (XLMPreTrainedModel , XLMModel, - XLMWithLMHeadModel, XLMForSequenceClassification, - XLMForQuestionAnswering, XLMForQuestionAnsweringSimple, - XLM_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, - RobertaForSequenceClassification, RobertaForMultipleChoice, - RobertaForTokenClassification, RobertaForQuestionAnswering, - ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_distilbert import (DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel, - DistilBertForSequenceClassification, DistilBertForQuestionAnswering, - DistilBertForTokenClassification, - DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_camembert import (CamembertForMaskedLM, CamembertModel, - CamembertForSequenceClassification, CamembertForMultipleChoice, - CamembertForTokenClassification, - CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_utils import PreTrainedModel, prune_layer, Conv1D + from .modeling_auto import ( + AutoModel, + AutoModelForSequenceClassification, + AutoModelForQuestionAnswering, + AutoModelWithLMHead, + AutoModelForTokenClassification, + ALL_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + + from .modeling_bert import ( + BertPreTrainedModel, + BertModel, + BertForPreTraining, + BertForMaskedLM, + BertForNextSentencePrediction, + BertForSequenceClassification, + BertForMultipleChoice, + BertForTokenClassification, + BertForQuestionAnswering, + load_tf_weights_in_bert, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_openai import ( + OpenAIGPTPreTrainedModel, + OpenAIGPTModel, + OpenAIGPTLMHeadModel, + OpenAIGPTDoubleHeadsModel, + load_tf_weights_in_openai_gpt, + OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_transfo_xl import ( + TransfoXLPreTrainedModel, + TransfoXLModel, + TransfoXLLMHeadModel, + AdaptiveEmbedding, + load_tf_weights_in_transfo_xl, + TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_gpt2 import ( + GPT2PreTrainedModel, + GPT2Model, + GPT2LMHeadModel, + GPT2DoubleHeadsModel, + load_tf_weights_in_gpt2, + GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_ctrl import CTRLPreTrainedModel, CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP + from .modeling_xlnet import ( + XLNetPreTrainedModel, + XLNetModel, + XLNetLMHeadModel, + 
XLNetForSequenceClassification, + XLNetForTokenClassification, + XLNetForMultipleChoice, + XLNetForQuestionAnsweringSimple, + XLNetForQuestionAnswering, + load_tf_weights_in_xlnet, + XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_xlm import ( + XLMPreTrainedModel, + XLMModel, + XLMWithLMHeadModel, + XLMForSequenceClassification, + XLMForQuestionAnswering, + XLMForQuestionAnsweringSimple, + XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_roberta import ( + RobertaForMaskedLM, + RobertaModel, + RobertaForSequenceClassification, + RobertaForMultipleChoice, + RobertaForTokenClassification, + RobertaForQuestionAnswering, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_distilbert import ( + DistilBertPreTrainedModel, + DistilBertForMaskedLM, + DistilBertModel, + DistilBertForSequenceClassification, + DistilBertForQuestionAnswering, + DistilBertForTokenClassification, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_camembert import ( + CamembertForMaskedLM, + CamembertModel, + CamembertForSequenceClassification, + CamembertForMultipleChoice, + CamembertForTokenClassification, + CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model - from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel, - load_tf_weights_in_t5, - T5_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, - AlbertForQuestionAnswering, - load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_xlm_roberta import (XLMRobertaForMaskedLM, XLMRobertaModel, XLMRobertaForMultipleChoice, - XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification) + from .modeling_t5 import ( + T5PreTrainedModel, + T5Model, + T5WithLMHeadModel, + load_tf_weights_in_t5, + T5_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_albert import ( + AlbertPreTrainedModel, + AlbertModel, + AlbertForMaskedLM, + AlbertForSequenceClassification, + AlbertForQuestionAnswering, + load_tf_weights_in_albert, + ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_xlm_roberta import ( + XLMRobertaForMaskedLM, + XLMRobertaModel, + XLMRobertaForMultipleChoice, + XLMRobertaForSequenceClassification, + XLMRobertaForTokenClassification, + ) from .modeling_mmbt import ModalEmbeddings, MMBTModel, MMBTForClassification # Optimization - from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup, - get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup) + from .optimization import ( + AdamW, + get_constant_schedule, + get_constant_schedule_with_warmup, + get_cosine_schedule_with_warmup, + get_cosine_with_hard_restarts_schedule_with_warmup, + get_linear_schedule_with_warmup, + ) # TensorFlow if is_tf_available(): from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list - from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, - TFAutoModelWithLMHead, TFAutoModelForTokenClassification, TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP) - - from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings, - TFBertModel, TFBertForPreTraining, - TFBertForMaskedLM, TFBertForNextSentencePrediction, - TFBertForSequenceClassification, TFBertForMultipleChoice, - TFBertForTokenClassification, TFBertForQuestionAnswering, - 
TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP) - - from .modeling_tf_gpt2 import (TFGPT2PreTrainedModel, TFGPT2MainLayer, - TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel, - TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) - - from .modeling_tf_openai import (TFOpenAIGPTPreTrainedModel, TFOpenAIGPTMainLayer, - TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel, - TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) - - from .modeling_tf_transfo_xl import (TFTransfoXLPreTrainedModel, TFTransfoXLMainLayer, - TFTransfoXLModel, TFTransfoXLLMHeadModel, - TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) - - from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer, - TFXLNetModel, TFXLNetLMHeadModel, - TFXLNetForSequenceClassification, - TFXLNetForTokenClassification, - TFXLNetForQuestionAnsweringSimple, - TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) - - from .modeling_tf_xlm import (TFXLMPreTrainedModel, TFXLMMainLayer, - TFXLMModel, TFXLMWithLMHeadModel, - TFXLMForSequenceClassification, - TFXLMForQuestionAnsweringSimple, - TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP) - - from .modeling_tf_roberta import (TFRobertaPreTrainedModel, TFRobertaMainLayer, - TFRobertaModel, TFRobertaForMaskedLM, - TFRobertaForSequenceClassification, - TFRobertaForTokenClassification, - TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) - - from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer, - TFDistilBertModel, TFDistilBertForMaskedLM, - TFDistilBertForSequenceClassification, - TFDistilBertForTokenClassification, - TFDistilBertForQuestionAnswering, - TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) - - from .modeling_tf_ctrl import (TFCTRLPreTrainedModel, TFCTRLModel, - TFCTRLLMHeadModel, - TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) - - from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM, - TFAlbertForSequenceClassification, - TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) - - from .modeling_tf_t5 import (TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel, - TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_auto import ( + TFAutoModel, + TFAutoModelForSequenceClassification, + TFAutoModelForQuestionAnswering, + TFAutoModelWithLMHead, + TFAutoModelForTokenClassification, + TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + + from .modeling_tf_bert import ( + TFBertPreTrainedModel, + TFBertMainLayer, + TFBertEmbeddings, + TFBertModel, + TFBertForPreTraining, + TFBertForMaskedLM, + TFBertForNextSentencePrediction, + TFBertForSequenceClassification, + TFBertForMultipleChoice, + TFBertForTokenClassification, + TFBertForQuestionAnswering, + TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + + from .modeling_tf_gpt2 import ( + TFGPT2PreTrainedModel, + TFGPT2MainLayer, + TFGPT2Model, + TFGPT2LMHeadModel, + TFGPT2DoubleHeadsModel, + TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + + from .modeling_tf_openai import ( + TFOpenAIGPTPreTrainedModel, + TFOpenAIGPTMainLayer, + TFOpenAIGPTModel, + TFOpenAIGPTLMHeadModel, + TFOpenAIGPTDoubleHeadsModel, + TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + + from .modeling_tf_transfo_xl import ( + TFTransfoXLPreTrainedModel, + TFTransfoXLMainLayer, + TFTransfoXLModel, + TFTransfoXLLMHeadModel, + TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + + from .modeling_tf_xlnet import ( + TFXLNetPreTrainedModel, + TFXLNetMainLayer, + TFXLNetModel, + TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForTokenClassification, + TFXLNetForQuestionAnsweringSimple, + TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + + from .modeling_tf_xlm import ( + 
TFXLMPreTrainedModel, + TFXLMMainLayer, + TFXLMModel, + TFXLMWithLMHeadModel, + TFXLMForSequenceClassification, + TFXLMForQuestionAnsweringSimple, + TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + + from .modeling_tf_roberta import ( + TFRobertaPreTrainedModel, + TFRobertaMainLayer, + TFRobertaModel, + TFRobertaForMaskedLM, + TFRobertaForSequenceClassification, + TFRobertaForTokenClassification, + TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + + from .modeling_tf_distilbert import ( + TFDistilBertPreTrainedModel, + TFDistilBertMainLayer, + TFDistilBertModel, + TFDistilBertForMaskedLM, + TFDistilBertForSequenceClassification, + TFDistilBertForTokenClassification, + TFDistilBertForQuestionAnswering, + TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + + from .modeling_tf_ctrl import ( + TFCTRLPreTrainedModel, + TFCTRLModel, + TFCTRLLMHeadModel, + TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + + from .modeling_tf_albert import ( + TFAlbertPreTrainedModel, + TFAlbertModel, + TFAlbertForMaskedLM, + TFAlbertForSequenceClassification, + TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + + from .modeling_tf_t5 import TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP # Optimization - from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator) + from .optimization_tf import WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator # TF 2.0 <=> PyTorch conversion utilities -from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name, - load_pytorch_checkpoint_in_tf2_model, - load_pytorch_weights_in_tf2_model, - load_pytorch_model_in_tf2_model, - load_tf2_checkpoint_in_pytorch_model, - load_tf2_weights_in_pytorch_model, - load_tf2_model_in_pytorch_model) +from .modeling_tf_pytorch_utils import ( + convert_tf_weight_name_to_pt_weight_name, + load_pytorch_checkpoint_in_tf2_model, + load_pytorch_weights_in_tf2_model, + load_pytorch_model_in_tf2_model, + load_tf2_checkpoint_in_pytorch_model, + load_tf2_weights_in_pytorch_model, + load_tf2_model_in_pytorch_model, +) # Pipelines -from .pipelines import pipeline, PipelineDataFormat, CsvPipelineDataFormat, JsonPipelineDataFormat, PipedPipelineDataFormat, \ - Pipeline, FeatureExtractionPipeline, QuestionAnsweringPipeline, NerPipeline, TextClassificationPipeline +from .pipelines import ( + pipeline, + PipelineDataFormat, + CsvPipelineDataFormat, + JsonPipelineDataFormat, + PipedPipelineDataFormat, + Pipeline, + FeatureExtractionPipeline, + QuestionAnsweringPipeline, + NerPipeline, + TextClassificationPipeline, +) if not is_tf_available() and not is_torch_available(): - logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found." - "Models won't be available and only tokenizers, configuration" - "and file/data utilities can be used.") + logger.warning( + "Neither PyTorch nor TensorFlow >= 2.0 have been found." + "Models won't be available and only tokenizers, configuration" + "and file/data utilities can be used." 
+ ) diff --git a/transformers/__main__.py b/transformers/__main__.py index dd259b04e..3cabdd4ff 100644 --- a/transformers/__main__.py +++ b/transformers/__main__.py @@ -1,16 +1,21 @@ # coding: utf8 + def main(): import sys + if len(sys.argv) < 2 or sys.argv[1] not in ["convert", "train", "predict", "serve"]: print( - "First argument to `transformers` command line interface should be one of: \n" - ">> convert serve train predict") + "First argument to `transformers` command line interface should be one of: \n" + ">> convert serve train predict" + ) if sys.argv[1] == "convert": from transformers.commands import convert + convert(sys.argv) elif sys.argv[1] == "train": from transformers.commands import train + train(sys.argv) elif sys.argv[1] == "serve": pass @@ -19,7 +24,6 @@ def main(): # parser = ArgumentParser('Transformers CLI tool', usage='transformers serve []') # commands_parser = parser.add_subparsers(help='transformers-cli command helpers') - # # Register commands # ServeCommand.register_subcommand(commands_parser) @@ -33,5 +37,6 @@ def main(): # service = args.func(args) # service.run() -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/transformers/commands/__init__.py b/transformers/commands/__init__.py index bbdd5655f..13171f428 100644 --- a/transformers/commands/__init__.py +++ b/transformers/commands/__init__.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from argparse import ArgumentParser + class BaseTransformersCLICommand(ABC): @staticmethod @abstractmethod diff --git a/transformers/commands/convert.py b/transformers/commands/convert.py index 55dbf5373..e358d8532 100644 --- a/transformers/commands/convert.py +++ b/transformers/commands/convert.py @@ -11,12 +11,12 @@ def convert_command_factory(args: Namespace): Factory function used to convert a model TF 1.0 checkpoint in a PyTorch checkpoint. 
:return: ServeCommand """ - return ConvertCommand(args.model_type, args.tf_checkpoint, args.pytorch_dump_output, - args.config, args.finetuning_task_name) + return ConvertCommand( + args.model_type, args.tf_checkpoint, args.pytorch_dump_output, args.config, args.finetuning_task_name + ) class ConvertCommand(BaseTransformersCLICommand): - @staticmethod def register_subcommand(parser: ArgumentParser): """ @@ -24,25 +24,39 @@ class ConvertCommand(BaseTransformersCLICommand): :param parser: Root parser to register command-specific arguments :return: """ - train_parser = parser.add_parser('convert', help="CLI tool to run convert model from original " - "author checkpoints to Transformesr PyTorch checkpoints.") - train_parser.add_argument('--model_type', type=str, required=True, - help='Model\'s type.') - train_parser.add_argument('--tf_checkpoint', type=str, required=True, - help='TensorFlow checkpoint path or folder.') - train_parser.add_argument('--pytorch_dump_output', type=str, required=True, - help='Path to the PyTorch savd model output.') - train_parser.add_argument('--config', type=str, default="", - help='Configuration file path or folder.') - train_parser.add_argument('--finetuning_task_name', type=str, default=None, - help='Optional fine-tuning task name if the TF model was a finetuned model.') + train_parser = parser.add_parser( + "convert", + help="CLI tool to run convert model from original " + "author checkpoints to Transformesr PyTorch checkpoints.", + ) + train_parser.add_argument("--model_type", type=str, required=True, help="Model's type.") + train_parser.add_argument( + "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder." + ) + train_parser.add_argument( + "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch savd model output." + ) + train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.") + train_parser.add_argument( + "--finetuning_task_name", + type=str, + default=None, + help="Optional fine-tuning task name if the TF model was a finetuned model.", + ) train_parser.set_defaults(func=convert_command_factory) - def __init__(self, model_type: str, tf_checkpoint: str, pytorch_dump_output: str, - config: str, finetuning_task_name: str, *args): - self._logger = getLogger('transformers-cli/converting') + def __init__( + self, + model_type: str, + tf_checkpoint: str, + pytorch_dump_output: str, + config: str, + finetuning_task_name: str, + *args + ): + self._logger = getLogger("transformers-cli/converting") - self._logger.info('Loading model {}'.format(model_type)) + self._logger.info("Loading model {}".format(model_type)) self._model_type = model_type self._tf_checkpoint = tf_checkpoint self._pytorch_dump_output = pytorch_dump_output @@ -52,63 +66,80 @@ class ConvertCommand(BaseTransformersCLICommand): def run(self): if self._model_type == "bert": try: - from transformers.convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch + from transformers.convert_bert_original_tf_checkpoint_to_pytorch import ( + convert_tf_checkpoint_to_pytorch, + ) except ImportError: - msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \ - "In that case, it requires TensorFlow to be installed. Please see " \ + msg = ( + "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. 
Please see " "https://www.tensorflow.org/install/ for installation instructions." + ) raise ImportError(msg) convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) elif self._model_type == "gpt": - from transformers.convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch - convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, - self._config, - self._pytorch_dump_output) + from transformers.convert_openai_original_tf_checkpoint_to_pytorch import ( + convert_openai_checkpoint_to_pytorch, + ) + + convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) elif self._model_type == "transfo_xl": try: - from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch + from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import ( + convert_transfo_xl_checkpoint_to_pytorch, + ) except ImportError: - msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \ - "In that case, it requires TensorFlow to be installed. Please see " \ + msg = ( + "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." + ) raise ImportError(msg) - if 'ckpt' in self._tf_checkpoint.lower(): + if "ckpt" in self._tf_checkpoint.lower(): TF_CHECKPOINT = self._tf_checkpoint TF_DATASET_FILE = "" else: TF_DATASET_FILE = self._tf_checkpoint TF_CHECKPOINT = "" - convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, - self._config, - self._pytorch_dump_output, - TF_DATASET_FILE) + convert_transfo_xl_checkpoint_to_pytorch( + TF_CHECKPOINT, self._config, self._pytorch_dump_output, TF_DATASET_FILE + ) elif self._model_type == "gpt2": try: - from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch + from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import ( + convert_gpt2_checkpoint_to_pytorch, + ) except ImportError: - msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \ - "In that case, it requires TensorFlow to be installed. Please see " \ + msg = ( + "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." + ) raise ImportError(msg) convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) elif self._model_type == "xlnet": try: - from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch + from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import ( + convert_xlnet_checkpoint_to_pytorch, + ) except ImportError: - msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \ - "In that case, it requires TensorFlow to be installed. Please see " \ + msg = ( + "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." 
+ ) raise ImportError(msg) - convert_xlnet_checkpoint_to_pytorch(self._tf_checkpoint, - self._config, - self._pytorch_dump_output, - self._finetuning_task_name) + convert_xlnet_checkpoint_to_pytorch( + self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name + ) elif self._model_type == "xlm": - from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch + from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import ( + convert_xlm_checkpoint_to_pytorch, + ) convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output) else: diff --git a/transformers/commands/download.py b/transformers/commands/download.py index 0938f135d..acfb3eeb9 100644 --- a/transformers/commands/download.py +++ b/transformers/commands/download.py @@ -8,13 +8,16 @@ def download_command_factory(args): class DownloadCommand(BaseTransformersCLICommand): - @staticmethod def register_subcommand(parser: ArgumentParser): - download_parser = parser.add_parser('download') - download_parser.add_argument('--cache-dir', type=str, default=None, help='Path to location to store the models') - download_parser.add_argument('--force', action='store_true', help='Force the model to be download even if already in cache-dir') - download_parser.add_argument('model', type=str, help='Name of the model to download') + download_parser = parser.add_parser("download") + download_parser.add_argument( + "--cache-dir", type=str, default=None, help="Path to location to store the models" + ) + download_parser.add_argument( + "--force", action="store_true", help="Force the model to be download even if already in cache-dir" + ) + download_parser.add_argument("model", type=str, help="Name of the model to download") download_parser.set_defaults(func=download_command_factory) def __init__(self, model: str, cache: str, force: bool): @@ -26,4 +29,4 @@ class DownloadCommand(BaseTransformersCLICommand): from transformers import AutoModel, AutoTokenizer AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) - AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) \ No newline at end of file + AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) diff --git a/transformers/commands/run.py b/transformers/commands/run.py index df03cee9d..617226306 100644 --- a/transformers/commands/run.py +++ b/transformers/commands/run.py @@ -10,52 +10,72 @@ logger = logging.getLogger(__name__) # pylint: disable=invalid-name def try_infer_format_from_ext(path: str): if not path: - return 'pipe' + return "pipe" for ext in PipelineDataFormat.SUPPORTED_FORMATS: if path.endswith(ext): return ext raise Exception( - 'Unable to determine file format from file extension {}. ' - 'Please provide the format through --format {}'.format(path, PipelineDataFormat.SUPPORTED_FORMATS) + "Unable to determine file format from file extension {}. 
" + "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS) ) def run_command_factory(args): - nlp = pipeline(task=args.task, - model=args.model if args.model else None, - config=args.config, - tokenizer=args.tokenizer, - device=args.device) - format = try_infer_format_from_ext(args.input) if args.format == 'infer' else args.format - reader = PipelineDataFormat.from_str(format=format, - output_path=args.output, - input_path=args.input, - column=args.column if args.column else nlp.default_input_names, - overwrite=args.overwrite) + nlp = pipeline( + task=args.task, + model=args.model if args.model else None, + config=args.config, + tokenizer=args.tokenizer, + device=args.device, + ) + format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format + reader = PipelineDataFormat.from_str( + format=format, + output_path=args.output, + input_path=args.input, + column=args.column if args.column else nlp.default_input_names, + overwrite=args.overwrite, + ) return RunCommand(nlp, reader) class RunCommand(BaseTransformersCLICommand): - def __init__(self, nlp: Pipeline, reader: PipelineDataFormat): self._nlp = nlp self._reader = reader @staticmethod def register_subcommand(parser: ArgumentParser): - run_parser = parser.add_parser('run', help="Run a pipeline through the CLI") - run_parser.add_argument('--task', choices=SUPPORTED_TASKS.keys(), help='Task to run') - run_parser.add_argument('--input', type=str, help='Path to the file to use for inference') - run_parser.add_argument('--output', type=str, help='Path to the file that will be used post to write results.') - run_parser.add_argument('--model', type=str, help='Name or path to the model to instantiate.') - run_parser.add_argument('--config', type=str, help='Name or path to the model\'s config to instantiate.') - run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. (default: same as the model name)') - run_parser.add_argument('--column', type=str, help='Name of the column to use as input. (For multi columns input as QA use column1,columns2)') - run_parser.add_argument('--format', type=str, default='infer', choices=PipelineDataFormat.SUPPORTED_FORMATS, help='Input format to read from') - run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)') - run_parser.add_argument('--overwrite', action='store_true', help='Allow overwriting the output file.') + run_parser = parser.add_parser("run", help="Run a pipeline through the CLI") + run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run") + run_parser.add_argument("--input", type=str, help="Path to the file to use for inference") + run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.") + run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.") + run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.") + run_parser.add_argument( + "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)" + ) + run_parser.add_argument( + "--column", + type=str, + help="Name of the column to use as input. 
(For multi columns input as QA use column1,columns2)", + ) + run_parser.add_argument( + "--format", + type=str, + default="infer", + choices=PipelineDataFormat.SUPPORTED_FORMATS, + help="Input format to read from", + ) + run_parser.add_argument( + "--device", + type=int, + default=-1, + help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", + ) + run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.") run_parser.set_defaults(func=run_command_factory) def run(self): @@ -71,9 +91,6 @@ class RunCommand(BaseTransformersCLICommand): # Saving data if self._nlp.binary_output: binary_path = self._reader.save_binary(outputs) - logger.warning('Current pipeline requires output to be in binary format, saving at {}'.format(binary_path)) + logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path)) else: self._reader.save(outputs) - - - diff --git a/transformers/commands/serving.py b/transformers/commands/serving.py index 4f41f797d..f7729c0bf 100644 --- a/transformers/commands/serving.py +++ b/transformers/commands/serving.py @@ -7,6 +7,7 @@ try: from uvicorn import run from fastapi import FastAPI, HTTPException, Body from pydantic import BaseModel + _serve_dependancies_installed = True except (ImportError, AttributeError): BaseModel = object @@ -17,18 +18,21 @@ from transformers import Pipeline from transformers.commands import BaseTransformersCLICommand from transformers.pipelines import SUPPORTED_TASKS, pipeline -logger = logging.getLogger('transformers-cli/serving') +logger = logging.getLogger("transformers-cli/serving") + def serve_command_factory(args: Namespace): """ Factory function used to instantiate serving server from provided command line arguments. 
:return: ServeCommand """ - nlp = pipeline(task=args.task, - model=args.model if args.model else None, - config=args.config, - tokenizer=args.tokenizer, - device=args.device) + nlp = pipeline( + task=args.task, + model=args.model if args.model else None, + config=args.config, + tokenizer=args.tokenizer, + device=args.device, + ) return ServeCommand(nlp, args.host, args.port) @@ -36,6 +40,7 @@ class ServeModelInfoResult(BaseModel): """ Expose model information """ + infos: dict @@ -43,6 +48,7 @@ class ServeTokenizeResult(BaseModel): """ Tokenize result model """ + tokens: List[str] tokens_ids: Optional[List[int]] @@ -51,6 +57,7 @@ class ServeDeTokenizeResult(BaseModel): """ DeTokenize result model """ + text: str @@ -58,11 +65,11 @@ class ServeForwardResult(BaseModel): """ Forward result model """ + output: Any class ServeCommand(BaseTransformersCLICommand): - @staticmethod def register_subcommand(parser: ArgumentParser): """ @@ -70,14 +77,23 @@ class ServeCommand(BaseTransformersCLICommand): :param parser: Root parser to register command-specific arguments :return: """ - serve_parser = parser.add_parser('serve', help='CLI tool to run inference requests through REST and GraphQL endpoints.') - serve_parser.add_argument('--task', type=str, choices=SUPPORTED_TASKS.keys(), help='The task to run the pipeline on') - serve_parser.add_argument('--host', type=str, default='localhost', help='Interface the server will listen on.') - serve_parser.add_argument('--port', type=int, default=8888, help='Port the serving will listen to.') - serve_parser.add_argument('--model', type=str, help='Model\'s name or path to stored model.') - serve_parser.add_argument('--config', type=str, help='Model\'s config name or path to stored model.') - serve_parser.add_argument('--tokenizer', type=str, help='Tokenizer name to use.') - serve_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)') + serve_parser = parser.add_parser( + "serve", help="CLI tool to run inference requests through REST and GraphQL endpoints." + ) + serve_parser.add_argument( + "--task", type=str, choices=SUPPORTED_TASKS.keys(), help="The task to run the pipeline on" + ) + serve_parser.add_argument("--host", type=str, default="localhost", help="Interface the server will listen on.") + serve_parser.add_argument("--port", type=int, default=8888, help="Port the serving will listen to.") + serve_parser.add_argument("--model", type=str, help="Model's name or path to stored model.") + serve_parser.add_argument("--config", type=str, help="Model's config name or path to stored model.") + serve_parser.add_argument("--tokenizer", type=str, help="Tokenizer name to use.") + serve_parser.add_argument( + "--device", + type=int, + default=-1, + help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", + ) serve_parser.set_defaults(func=serve_command_factory) def __init__(self, pipeline: Pipeline, host: str, port: int): @@ -87,18 +103,22 @@ class ServeCommand(BaseTransformersCLICommand): self._host = host self._port = port if not _serve_dependancies_installed: - raise ImportError("Using serve command requires FastAPI and unicorn. " - "Please install transformers with [serving]: pip install transformers[serving]." - "Or install FastAPI and unicorn separatly.") + raise ImportError( + "Using serve command requires FastAPI and uvicorn. " + "Please install transformers with [serving]: pip install transformers[serving]." 
+ "Or install FastAPI and uvicorn separately." + ) else: - logger.info('Serving model over {}:{}'.format(host, port)) + logger.info("Serving model over {}:{}".format(host, port)) self._app = FastAPI() # Register routes - self._app.add_api_route('/', self.model_info, response_model=ServeModelInfoResult, methods=['GET']) - self._app.add_api_route('/tokenize', self.tokenize, response_model=ServeTokenizeResult, methods=['POST']) - self._app.add_api_route('/detokenize', self.detokenize, response_model=ServeDeTokenizeResult, methods=['POST']) - self._app.add_api_route('/forward', self.forward, response_model=ServeForwardResult, methods=['POST']) + self._app.add_api_route("/", self.model_info, response_model=ServeModelInfoResult, methods=["GET"]) + self._app.add_api_route("/tokenize", self.tokenize, response_model=ServeTokenizeResult, methods=["POST"]) + self._app.add_api_route( + "/detokenize", self.detokenize, response_model=ServeDeTokenizeResult, methods=["POST"] + ) + self._app.add_api_route("/forward", self.forward, response_model=ServeForwardResult, methods=["POST"]) def run(self): run(self._app, host=self._host, port=self._port) @@ -122,11 +142,14 @@ class ServeCommand(BaseTransformersCLICommand): return ServeTokenizeResult(tokens=tokens_txt) except Exception as e: - raise HTTPException(status_code=500, detail={"model": '', "error": str(e)}) - - def detokenize(self, tokens_ids: List[int] = Body(None, embed=True), - skip_special_tokens: bool = Body(False, embed=True), - cleanup_tokenization_spaces: bool = Body(True, embed=True)): + raise HTTPException(status_code=500, detail={"model": "", "error": str(e)}) + + def detokenize( + self, + tokens_ids: List[int] = Body(None, embed=True), + skip_special_tokens: bool = Body(False, embed=True), + cleanup_tokenization_spaces: bool = Body(True, embed=True), + ): """ Detokenize the provided tokens ids to readable text: - **tokens_ids**: List of tokens ids @@ -135,9 +158,9 @@ class ServeCommand(BaseTransformersCLICommand): """ try: decoded_str = self._pipeline.tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces) - return ServeDeTokenizeResult(model='', text=decoded_str) + return ServeDeTokenizeResult(model="", text=decoded_str) except Exception as e: - raise HTTPException(status_code=500, detail={"model": '', "error": str(e)}) + raise HTTPException(status_code=500, detail={"model": "", "error": str(e)}) def forward(self, inputs: Union[str, dict, List[str], List[int], List[dict]] = Body(None, embed=True)): """ diff --git a/transformers/commands/train.py b/transformers/commands/train.py index 7b2674588..e51be71c7 100644 --- a/transformers/commands/train.py +++ b/transformers/commands/train.py @@ -3,9 +3,12 @@ from argparse import ArgumentParser, Namespace from logging import getLogger from transformers.commands import BaseTransformersCLICommand -from transformers import (is_tf_available, is_torch_available, - TextClassificationPipeline, - SingleSentenceClassificationProcessor as Processor) +from transformers import ( + is_tf_available, + is_torch_available, + TextClassificationPipeline, + SingleSentenceClassificationProcessor as Processor, +) if not is_tf_available() and not is_torch_available(): raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training") @@ -14,6 +17,7 @@ if not is_tf_available() and not is_torch_available(): USE_XLA = False USE_AMP = False + def train_command_factory(args: Namespace): """ Factory function used to instantiate serving server from provided command line 
arguments. @@ -23,7 +27,6 @@ def train_command_factory(args: Namespace): class TrainCommand(BaseTransformersCLICommand): - @staticmethod def register_subcommand(parser: ArgumentParser): """ @@ -31,47 +34,54 @@ class TrainCommand(BaseTransformersCLICommand): :param parser: Root parser to register command-specific arguments :return: """ - train_parser = parser.add_parser('train', help='CLI tool to train a model on a task.') - - train_parser.add_argument('--train_data', type=str, required=True, - help="path to train (and optionally evaluation) dataset as a csv with " - "tab separated labels and sentences.") - train_parser.add_argument('--column_label', type=int, default=0, - help='Column of the dataset csv file with example labels.') - train_parser.add_argument('--column_text', type=int, default=1, - help='Column of the dataset csv file with example texts.') - train_parser.add_argument('--column_id', type=int, default=2, - help='Column of the dataset csv file with example ids.') - train_parser.add_argument('--skip_first_row', action='store_true', - help='Skip the first row of the csv file (headers).') - - train_parser.add_argument('--validation_data', type=str, default='', - help='path to validation dataset.') - train_parser.add_argument('--validation_split', type=float, default=0.1, - help="if validation dataset is not provided, fraction of train dataset " - "to use as validation dataset.") - - train_parser.add_argument('--output', type=str, default='./', - help='path to saved the trained model.') - - train_parser.add_argument('--task', type=str, default='text_classification', - help='Task to train the model on.') - train_parser.add_argument('--model', type=str, default='bert-base-uncased', - help='Model\'s name or path to stored model.') - train_parser.add_argument('--train_batch_size', type=int, default=32, - help='Batch size for training.') - train_parser.add_argument('--valid_batch_size', type=int, default=64, - help='Batch size for validation.') - train_parser.add_argument('--learning_rate', type=float, default=3e-5, - help="Learning rate.") - train_parser.add_argument('--adam_epsilon', type=float, default=1e-08, - help="Epsilon for Adam optimizer.") + train_parser = parser.add_parser("train", help="CLI tool to train a model on a task.") + + train_parser.add_argument( + "--train_data", + type=str, + required=True, + help="path to train (and optionally evaluation) dataset as a csv with " + "tab separated labels and sentences.", + ) + train_parser.add_argument( + "--column_label", type=int, default=0, help="Column of the dataset csv file with example labels." + ) + train_parser.add_argument( + "--column_text", type=int, default=1, help="Column of the dataset csv file with example texts." + ) + train_parser.add_argument( + "--column_id", type=int, default=2, help="Column of the dataset csv file with example ids." + ) + train_parser.add_argument( + "--skip_first_row", action="store_true", help="Skip the first row of the csv file (headers)." + ) + + train_parser.add_argument("--validation_data", type=str, default="", help="path to validation dataset.") + train_parser.add_argument( + "--validation_split", + type=float, + default=0.1, + help="if validation dataset is not provided, fraction of train dataset " "to use as validation dataset.", + ) + + train_parser.add_argument("--output", type=str, default="./", help="path to saved the trained model.") + + train_parser.add_argument( + "--task", type=str, default="text_classification", help="Task to train the model on." 
+ ) + train_parser.add_argument( + "--model", type=str, default="bert-base-uncased", help="Model's name or path to stored model." + ) + train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.") + train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.") + train_parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.") + train_parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon for Adam optimizer.") train_parser.set_defaults(func=train_command_factory) def __init__(self, args: Namespace): - self.logger = getLogger('transformers-cli/training') + self.logger = getLogger("transformers-cli/training") - self.framework = 'tf' if is_tf_available() else 'torch' + self.framework = "tf" if is_tf_available() else "torch" os.makedirs(args.output, exist_ok=True) assert os.path.isdir(args.output) @@ -81,28 +91,32 @@ class TrainCommand(BaseTransformersCLICommand): self.column_text = args.column_text self.column_id = args.column_id - self.logger.info('Loading {} pipeline for {}'.format(args.task, args.model)) - if args.task == 'text_classification': + self.logger.info("Loading {} pipeline for {}".format(args.task, args.model)) + if args.task == "text_classification": self.pipeline = TextClassificationPipeline.from_pretrained(args.model) - elif args.task == 'token_classification': + elif args.task == "token_classification": raise NotImplementedError - elif args.task == 'question_answering': + elif args.task == "question_answering": raise NotImplementedError - self.logger.info('Loading dataset from {}'.format(args.train_data)) - self.train_dataset = Processor.create_from_csv(args.train_data, - column_label=args.column_label, - column_text=args.column_text, - column_id=args.column_id, - skip_first_row=args.skip_first_row) + self.logger.info("Loading dataset from {}".format(args.train_data)) + self.train_dataset = Processor.create_from_csv( + args.train_data, + column_label=args.column_label, + column_text=args.column_text, + column_id=args.column_id, + skip_first_row=args.skip_first_row, + ) self.valid_dataset = None if args.validation_data: - self.logger.info('Loading validation dataset from {}'.format(args.validation_data)) - self.valid_dataset = Processor.create_from_csv(args.validation_data, - column_label=args.column_label, - column_text=args.column_text, - column_id=args.column_id, - skip_first_row=args.skip_first_row) + self.logger.info("Loading validation dataset from {}".format(args.validation_data)) + self.valid_dataset = Processor.create_from_csv( + args.validation_data, + column_label=args.column_label, + column_text=args.column_text, + column_id=args.column_id, + skip_first_row=args.skip_first_row, + ) self.validation_split = args.validation_split self.train_batch_size = args.train_batch_size @@ -111,7 +125,7 @@ class TrainCommand(BaseTransformersCLICommand): self.adam_epsilon = args.adam_epsilon def run(self): - if self.framework == 'tf': + if self.framework == "tf": return self.run_tf() return self.run_torch() @@ -119,13 +133,15 @@ class TrainCommand(BaseTransformersCLICommand): raise NotImplementedError def run_tf(self): - self.pipeline.fit(self.train_dataset, - validation_data=self.valid_dataset, - validation_split=self.validation_split, - learning_rate=self.learning_rate, - adam_epsilon=self.adam_epsilon, - train_batch_size=self.train_batch_size, - valid_batch_size=self.valid_batch_size) + self.pipeline.fit( + self.train_dataset, + 
validation_data=self.valid_dataset, + validation_split=self.validation_split, + learning_rate=self.learning_rate, + adam_epsilon=self.adam_epsilon, + train_batch_size=self.train_batch_size, + valid_batch_size=self.valid_batch_size, + ) # Save trained pipeline self.pipeline.save_pretrained(self.output) diff --git a/transformers/commands/user.py b/transformers/commands/user.py index 8e0e56342..d29867d7c 100644 --- a/transformers/commands/user.py +++ b/transformers/commands/user.py @@ -9,28 +9,31 @@ from transformers.hf_api import HfApi, HfFolder, HTTPError class UserCommands(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): - login_parser = parser.add_parser('login') + login_parser = parser.add_parser("login") login_parser.set_defaults(func=lambda args: LoginCommand(args)) - whoami_parser = parser.add_parser('whoami') + whoami_parser = parser.add_parser("whoami") whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args)) - logout_parser = parser.add_parser('logout') + logout_parser = parser.add_parser("logout") logout_parser.set_defaults(func=lambda args: LogoutCommand(args)) - list_parser = parser.add_parser('ls') + list_parser = parser.add_parser("ls") list_parser.set_defaults(func=lambda args: ListObjsCommand(args)) # upload - upload_parser = parser.add_parser('upload') - upload_parser.add_argument('path', type=str, help='Local path of the folder or individual file to upload.') - upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override individual object filename on S3.') + upload_parser = parser.add_parser("upload") + upload_parser.add_argument("path", type=str, help="Local path of the folder or individual file to upload.") + upload_parser.add_argument( + "--filename", type=str, default=None, help="Optional: override individual object filename on S3." 
+ ) upload_parser.set_defaults(func=lambda args: UploadCommand(args)) - class ANSI: """ Helper for en.wikipedia.org/wiki/ANSI_escape_code """ + _bold = u"\u001b[1m" _reset = u"\u001b[0m" + @classmethod def bold(cls, s): return "{}{}{}".format(cls._bold, s, cls._reset) @@ -44,14 +47,16 @@ class BaseUserCommand: class LoginCommand(BaseUserCommand): def run(self): - print(""" + print( + """ _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_| _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_| - """) + """ + ) username = input("Username: ") password = getpass() try: @@ -101,16 +106,10 @@ class ListObjsCommand(BaseUserCommand): col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)] row_format = ("{{:{}}} " * len(headers)).format(*col_widths) lines = [] - lines.append( - row_format.format(*headers) - ) - lines.append( - row_format.format(*["-" * w for w in col_widths]) - ) + lines.append(row_format.format(*headers)) + lines.append(row_format.format(*["-" * w for w in col_widths])) for row in rows: - lines.append( - row_format.format(*row) - ) + lines.append(row_format.format(*row)) return "\n".join(lines) def run(self): @@ -126,15 +125,8 @@ class ListObjsCommand(BaseUserCommand): if len(objs) == 0: print("No shared file yet") exit() - rows = [ [ - obj.filename, - obj.LastModified, - obj.ETag, - obj.Size - ] for obj in objs ] - print( - self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"]) - ) + rows = [[obj.filename, obj.LastModified, obj.ETag, obj.Size] for obj in objs] + print(self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"])) class UploadCommand(BaseUserCommand): @@ -143,13 +135,7 @@ class UploadCommand(BaseUserCommand): Recursively list all files in a folder. """ entries: List[os.DirEntry] = list(os.scandir(rel_path)) - files = [ - ( - os.path.join(os.getcwd(), f.path), # filepath - f.path # filename - ) - for f in entries if f.is_file() - ] + files = [(os.path.join(os.getcwd(), f.path), f.path) for f in entries if f.is_file()] # filepath # filename for f in entries: if f.is_dir(): files += self.walk_dir(f.path) @@ -173,22 +159,14 @@ class UploadCommand(BaseUserCommand): raise ValueError("Not a valid file or directory: {}".format(local_path)) for filepath, filename in files: - print( - "About to upload file {} to S3 under filename {}".format( - ANSI.bold(filepath), ANSI.bold(filename) - ) - ) + print("About to upload file {} to S3 under filename {}".format(ANSI.bold(filepath), ANSI.bold(filename))) choice = input("Proceed? [Y/n] ").lower() - if not(choice == "" or choice == "y" or choice == "yes"): + if not (choice == "" or choice == "y" or choice == "yes"): print("Abort") exit() - print( - ANSI.bold("Uploading... This might take a while if files are large") - ) + print(ANSI.bold("Uploading... 
This might take a while if files are large")) for filepath, filename in files: - access_url = self._api.presign_and_upload( - token=token, filename=filename, filepath=filepath - ) + access_url = self._api.presign_and_upload(token=token, filename=filename, filepath=filepath) print("Your file now lives at:") print(access_url) diff --git a/transformers/configuration_albert.py b/transformers/configuration_albert.py index 6a1ef78dd..dc2b74a29 100644 --- a/transformers/configuration_albert.py +++ b/transformers/configuration_albert.py @@ -18,16 +18,17 @@ from .configuration_utils import PretrainedConfig ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json", - 'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json", - 'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json", - 'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json", - 'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json", - 'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json", - 'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json", - 'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json", + "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json", + "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json", + "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json", + "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json", + "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json", + "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json", + "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json", + "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json", } + class AlbertConfig(PretrainedConfig): """Configuration for `AlbertModel`. @@ -36,22 +37,25 @@ class AlbertConfig(PretrainedConfig): pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=30000, - embedding_size=128, - hidden_size=4096, - num_hidden_layers=12, - num_hidden_groups=1, - num_attention_heads=64, - intermediate_size=16384, - inner_group_num=1, - hidden_act="gelu_new", - hidden_dropout_prob=0, - attention_probs_dropout_prob=0, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, **kwargs): + def __init__( + self, + vocab_size=30000, + embedding_size=128, + hidden_size=4096, + num_hidden_layers=12, + num_hidden_groups=1, + num_attention_heads=64, + intermediate_size=16384, + inner_group_num=1, + hidden_act="gelu_new", + hidden_dropout_prob=0, + attention_probs_dropout_prob=0, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + **kwargs + ): """Constructs AlbertConfig. 
Args: diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py index 281256389..e4311fc28 100644 --- a/transformers/configuration_auto.py +++ b/transformers/configuration_auto.py @@ -35,7 +35,8 @@ from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_ logger = logging.getLogger(__name__) -ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value) +ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict( + (key, value) for pretrained_map in [ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -50,8 +51,9 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value) CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, - ] - for key, value, in pretrained_map.items()) + ] + for key, value, in pretrained_map.items() +) class AutoConfig(object): @@ -79,37 +81,42 @@ class AutoConfig(object): - contains `ctrl` : CTRLConfig (CTRL model) This class cannot be instantiated using `__init__()` (throw an error). """ + def __init__(self): - raise EnvironmentError("AutoConfig is designed to be instantiated " - "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.") + raise EnvironmentError( + "AutoConfig is designed to be instantiated " + "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method." + ) @classmethod def for_model(cls, model_type, *args, **kwargs): - if 'distilbert' in model_type: + if "distilbert" in model_type: return DistilBertConfig(*args, **kwargs) - elif 'roberta' in model_type: + elif "roberta" in model_type: return RobertaConfig(*args, **kwargs) - elif 'bert' in model_type: + elif "bert" in model_type: return BertConfig(*args, **kwargs) - elif 'openai-gpt' in model_type: + elif "openai-gpt" in model_type: return OpenAIGPTConfig(*args, **kwargs) - elif 'gpt2' in model_type: + elif "gpt2" in model_type: return GPT2Config(*args, **kwargs) - elif 'transfo-xl' in model_type: + elif "transfo-xl" in model_type: return TransfoXLConfig(*args, **kwargs) - elif 'xlnet' in model_type: + elif "xlnet" in model_type: return XLNetConfig(*args, **kwargs) - elif 'xlm' in model_type: + elif "xlm" in model_type: return XLMConfig(*args, **kwargs) - elif 'ctrl' in model_type: + elif "ctrl" in model_type: return CTRLConfig(*args, **kwargs) - elif 'albert' in model_type: + elif "albert" in model_type: return AlbertConfig(*args, **kwargs) - elif 'camembert' in model_type: + elif "camembert" in model_type: return CamembertConfig(*args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm', 'roberta', 'ctrl', 'camembert', 'albert'".format(model_type)) + raise ValueError( + "Unrecognized model identifier in {}. 
Should contains one of " + "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta', 'ctrl', 'camembert', 'albert'".format(model_type) + ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): @@ -176,32 +183,36 @@ class AutoConfig(object): assert unused_kwargs == {'foo': False} """ - if 't5' in pretrained_model_name_or_path: + if "t5" in pretrained_model_name_or_path: return T5Config.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: + elif "distilbert" in pretrained_model_name_or_path: return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'albert' in pretrained_model_name_or_path: + elif "albert" in pretrained_model_name_or_path: return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'camembert' in pretrained_model_name_or_path: + elif "camembert" in pretrained_model_name_or_path: return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'xlm-roberta' in pretrained_model_name_or_path: + elif "xlm-roberta" in pretrained_model_name_or_path: return XLMRobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: + elif "roberta" in pretrained_model_name_or_path: return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'openai-gpt' in pretrained_model_name_or_path: + elif "openai-gpt" in pretrained_model_name_or_path: return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'gpt2' in pretrained_model_name_or_path: + elif "gpt2" in pretrained_model_name_or_path: return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'transfo-xl' in pretrained_model_name_or_path: + elif "transfo-xl" in pretrained_model_name_or_path: return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'ctrl' in pretrained_model_name_or_path: + elif "ctrl" in pretrained_model_name_or_path: return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. 
Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format( + pretrained_model_name_or_path + ) + ) diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py index 7b495013f..7c5ee434a 100644 --- a/transformers/configuration_bert.py +++ b/transformers/configuration_bert.py @@ -27,27 +27,27 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", - 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", - 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", - 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", - 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", - 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", - 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", - 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", - 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", - 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json", - 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json", - 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json", - 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json", - 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json", - 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json", + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", + "bert-large-uncased": 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", + "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", + "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", + "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", + "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json", + "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json", + "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json", + "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json", + "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json", + "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json", } @@ -82,20 +82,22 @@ class BertConfig(PretrainedConfig): """ pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - **kwargs): + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + **kwargs + ): super(BertConfig, self).__init__(**kwargs) self.vocab_size = 
vocab_size self.hidden_size = hidden_size diff --git a/transformers/configuration_camembert.py b/transformers/configuration_camembert.py index 3ff64454e..9aa641aa5 100644 --- a/transformers/configuration_camembert.py +++ b/transformers/configuration_camembert.py @@ -15,8 +15,7 @@ # limitations under the License. """ CamemBERT configuration """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging @@ -25,7 +24,7 @@ from .configuration_roberta import RobertaConfig logger = logging.getLogger(__name__) CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", + "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", } diff --git a/transformers/configuration_ctrl.py b/transformers/configuration_ctrl.py index f9b9e409e..2726727d4 100644 --- a/transformers/configuration_ctrl.py +++ b/transformers/configuration_ctrl.py @@ -27,6 +27,7 @@ logger = logging.getLogger(__name__) CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/ctrl-config.json"} + class CTRLConfig(PretrainedConfig): """Configuration class to store the configuration of a `CTRLModel`. @@ -48,6 +49,7 @@ class CTRLConfig(PretrainedConfig): initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. """ + pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__( @@ -64,7 +66,7 @@ class CTRLConfig(PretrainedConfig): attn_pdrop=0.1, layer_norm_epsilon=1e-6, initializer_range=0.02, - summary_type='cls_index', + summary_type="cls_index", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, diff --git a/transformers/configuration_distilbert.py b/transformers/configuration_distilbert.py index d9f7cc634..120cbfb9f 100644 --- a/transformers/configuration_distilbert.py +++ b/transformers/configuration_distilbert.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" DistilBERT model configuration """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import sys import json @@ -26,32 +25,34 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", - 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json", - 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json", - 'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json", + "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", + "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json", + "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json", + "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json", } class DistilBertConfig(PretrainedConfig): pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=30522, - max_position_embeddings=512, - sinusoidal_pos_embds=False, - n_layers=6, - n_heads=12, - dim=768, - hidden_dim=4*768, - dropout=0.1, - attention_dropout=0.1, - activation='gelu', - initializer_range=0.02, - tie_weights_=True, - qa_dropout=0.1, - seq_classif_dropout=0.2, - **kwargs): + def __init__( + self, + vocab_size=30522, + max_position_embeddings=512, + sinusoidal_pos_embds=False, + n_layers=6, + n_heads=12, + dim=768, + hidden_dim=4 * 768, + dropout=0.1, + attention_dropout=0.1, + activation="gelu", + initializer_range=0.02, + tie_weights_=True, + qa_dropout=0.1, + seq_classif_dropout=0.2, + **kwargs + ): super(DistilBertConfig, self).__init__(**kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/transformers/configuration_gpt2.py b/transformers/configuration_gpt2.py index 4c200c076..adc8842ed 100644 --- a/transformers/configuration_gpt2.py +++ b/transformers/configuration_gpt2.py @@ -26,11 +26,14 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) -GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", - "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", - "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json", - "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json", - "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",} +GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json", + "gpt2-xl": 
"https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json", +} + class GPT2Config(PretrainedConfig): """Configuration class to store the configuration of a `GPT2Model`. @@ -52,6 +55,7 @@ class GPT2Config(PretrainedConfig): initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. """ + pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__( @@ -67,7 +71,7 @@ class GPT2Config(PretrainedConfig): attn_pdrop=0.1, layer_norm_epsilon=1e-5, initializer_range=0.02, - summary_type='cls_index', + summary_type="cls_index", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, diff --git a/transformers/configuration_mmbt.py b/transformers/configuration_mmbt.py index 60176c987..5dad2babe 100644 --- a/transformers/configuration_mmbt.py +++ b/transformers/configuration_mmbt.py @@ -15,8 +15,7 @@ # limitations under the License. """ MMBT configuration """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging @@ -31,6 +30,7 @@ class MMBTConfig(object): num_labels: Size of final Linear layer for classification. modal_hidden_size: Embedding dimension of the non-text modality encoder. """ + def __init__(self, config, num_labels=None, modal_hidden_size=2048): self.__dict__ = config.__dict__ self.modal_hidden_size = modal_hidden_size diff --git a/transformers/configuration_openai.py b/transformers/configuration_openai.py index 7776a0bb9..53929aab5 100644 --- a/transformers/configuration_openai.py +++ b/transformers/configuration_openai.py @@ -30,6 +30,7 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" } + class OpenAIGPTConfig(PretrainedConfig): """ Configuration class to store the configuration of a `OpenAIGPTModel`. @@ -54,6 +55,7 @@ class OpenAIGPTConfig(PretrainedConfig): initializing all weight matrices. predict_special_tokens: should we predict special tokens (when the model has a LM head) """ + pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__( @@ -71,7 +73,7 @@ class OpenAIGPTConfig(PretrainedConfig): layer_norm_epsilon=1e-5, initializer_range=0.02, predict_special_tokens=True, - summary_type='cls_index', + summary_type="cls_index", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, diff --git a/transformers/configuration_roberta.py b/transformers/configuration_roberta.py index 842edac56..3b8ddd1c4 100644 --- a/transformers/configuration_roberta.py +++ b/transformers/configuration_roberta.py @@ -15,8 +15,7 @@ # limitations under the License. 
""" RoBERTa configuration """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging @@ -25,12 +24,12 @@ from .configuration_bert import BertConfig logger = logging.getLogger(__name__) ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", - 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", - 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", - 'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json", - 'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json", - 'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json", + "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", + "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", + "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", + "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json", + "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json", + "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json", } diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py index 377a0919d..4584015e2 100644 --- a/transformers/configuration_t5.py +++ b/transformers/configuration_t5.py @@ -27,11 +27,11 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) T5_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json", - 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json", - 't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json", - 't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json", - 't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json", + "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json", + "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json", + "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json", + "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json", + "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json", } @@ -65,19 +65,21 @@ class T5Config(PretrainedConfig): """ pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=32128, - n_positions=512, - d_model=512, - d_kv=64, - d_ff=2048, - num_layers=6, - num_heads=8, - relative_attention_num_buckets=32, - dropout_rate=0.1, - layer_norm_epsilon=1e-6, - initializer_factor=1.0, - **kwargs): + def __init__( + self, + vocab_size=32128, + n_positions=512, + d_model=512, + d_kv=64, + d_ff=2048, + num_layers=6, + num_heads=8, + relative_attention_num_buckets=32, + dropout_rate=0.1, + layer_norm_epsilon=1e-6, + initializer_factor=1.0, 
+ **kwargs + ): super(T5Config, self).__init__(**kwargs) self.vocab_size = vocab_size self.n_positions = n_positions diff --git a/transformers/configuration_transfo_xl.py b/transformers/configuration_transfo_xl.py index 52f0f45a5..a2a7c5c02 100644 --- a/transformers/configuration_transfo_xl.py +++ b/transformers/configuration_transfo_xl.py @@ -27,9 +27,10 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", + "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", } + class TransfoXLConfig(PretrainedConfig): """Configuration class to store the configuration of a `TransfoXLModel`. @@ -65,38 +66,41 @@ class TransfoXLConfig(PretrainedConfig): proj_init_std: parameters initialized by N(0, init_std) init_std: parameters initialized by N(0, init_std) """ + pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=267735, - cutoffs=[20000, 40000, 200000], - d_model=1024, - d_embed=1024, - n_head=16, - d_head=64, - d_inner=4096, - div_val=4, - pre_lnorm=False, - n_layer=18, - tgt_len=128, - ext_len=0, - mem_len=1600, - clamp_len=1000, - same_length=True, - proj_share_all_but_first=True, - attn_type=0, - sample_softmax=-1, - adaptive=True, - tie_weight=True, - dropout=0.1, - dropatt=0.0, - untie_r=True, - init="normal", - init_range=0.01, - proj_init_std=0.01, - init_std=0.02, - layer_norm_epsilon=1e-5, - **kwargs): + def __init__( + self, + vocab_size=267735, + cutoffs=[20000, 40000, 200000], + d_model=1024, + d_embed=1024, + n_head=16, + d_head=64, + d_inner=4096, + div_val=4, + pre_lnorm=False, + n_layer=18, + tgt_len=128, + ext_len=0, + mem_len=1600, + clamp_len=1000, + same_length=True, + proj_share_all_but_first=True, + attn_type=0, + sample_softmax=-1, + adaptive=True, + tie_weight=True, + dropout=0.1, + dropatt=0.0, + untie_r=True, + init="normal", + init_range=0.01, + proj_init_std=0.01, + init_std=0.02, + layer_norm_epsilon=1e-5, + **kwargs + ): """Constructs TransfoXLConfig. """ super(TransfoXLConfig, self).__init__(**kwargs) diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py index d2d6ee5d8..f29899175 100644 --- a/transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -15,8 +15,7 @@ # limitations under the License. """ Configuration base class and utilities.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import copy import json @@ -28,6 +27,7 @@ from .file_utils import CONFIG_NAME, cached_path, is_remote_url, hf_bucket_url logger = logging.getLogger(__name__) + class PretrainedConfig(object): r""" Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. 
@@ -50,36 +50,36 @@ class PretrainedConfig(object): def __init__(self, **kwargs): # Attributes with defaults - self.output_attentions = kwargs.pop('output_attentions', False) - self.output_hidden_states = kwargs.pop('output_hidden_states', False) - self.output_past = kwargs.pop('output_past', True) # Not used by all models - self.torchscript = kwargs.pop('torchscript', False) # Only used by PyTorch models - self.use_bfloat16 = kwargs.pop('use_bfloat16', False) - self.pruned_heads = kwargs.pop('pruned_heads', {}) + self.output_attentions = kwargs.pop("output_attentions", False) + self.output_hidden_states = kwargs.pop("output_hidden_states", False) + self.output_past = kwargs.pop("output_past", True) # Not used by all models + self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models + self.use_bfloat16 = kwargs.pop("use_bfloat16", False) + self.pruned_heads = kwargs.pop("pruned_heads", {}) # Is decoder is used in encoder-decoder models to differentiate encoder from decoder - self.is_decoder = kwargs.pop('is_decoder', False) + self.is_decoder = kwargs.pop("is_decoder", False) # Parameters for sequence generation - self.max_length = kwargs.pop('max_length', 20) - self.do_sample = kwargs.pop('do_sample', False) - self.num_beams = kwargs.pop('num_beams', 1) - self.temperature = kwargs.pop('temperature', 1.0) - self.top_k = kwargs.pop('top_k', 50) - self.top_p = kwargs.pop('top_p', 1.0) - self.repetition_penalty = kwargs.pop('repetition_penalty', 1.0) - self.bos_token_id = kwargs.pop('bos_token_id', 0) - self.pad_token_id = kwargs.pop('pad_token_id', 0) - self.eos_token_ids = kwargs.pop('eos_token_ids', 0) - self.length_penalty = kwargs.pop('length_penalty', 1.) - self.num_return_sequences = kwargs.pop('num_return_sequences', 1) + self.max_length = kwargs.pop("max_length", 20) + self.do_sample = kwargs.pop("do_sample", False) + self.num_beams = kwargs.pop("num_beams", 1) + self.temperature = kwargs.pop("temperature", 1.0) + self.top_k = kwargs.pop("top_k", 50) + self.top_p = kwargs.pop("top_p", 1.0) + self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) + self.bos_token_id = kwargs.pop("bos_token_id", 0) + self.pad_token_id = kwargs.pop("pad_token_id", 0) + self.eos_token_ids = kwargs.pop("eos_token_ids", 0) + self.length_penalty = kwargs.pop("length_penalty", 1.0) + self.num_return_sequences = kwargs.pop("num_return_sequences", 1) # Fine-tuning task arguments - self.finetuning_task = kwargs.pop('finetuning_task', None) - self.num_labels = kwargs.pop('num_labels', 2) - self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)}) + self.finetuning_task = kwargs.pop("finetuning_task", None) + self.num_labels = kwargs.pop("num_labels", 2) + self.id2label = kwargs.pop("id2label", {i: "LABEL_{}".format(i) for i in range(self.num_labels)}) self.id2label = dict((int(key), value) for key, value in self.id2label.items()) - self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys()))) + self.label2id = kwargs.pop("label2id", dict(zip(self.id2label.values(), self.id2label.keys()))) self.label2id = dict((key, int(value)) for key, value in self.label2id.items()) # Additional attributes without default values @@ -94,7 +94,9 @@ class PretrainedConfig(object): """ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method. 
""" - assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" + assert os.path.isdir( + save_directory + ), "Saving path should be a directory where the model and configuration can be saved" # If we save using the predefined names, we can load using `from_pretrained` output_config_file = os.path.join(save_directory, CONFIG_NAME) @@ -153,11 +155,11 @@ class PretrainedConfig(object): assert unused_kwargs == {'foo': False} """ - cache_dir = kwargs.pop('cache_dir', None) - force_download = kwargs.pop('force_download', False) - resume_download = kwargs.pop('resume_download', False) - proxies = kwargs.pop('proxies', None) - return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) if pretrained_model_name_or_path in cls.pretrained_config_archive_map: config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path] @@ -170,37 +172,48 @@ class PretrainedConfig(object): try: # Load from URL or cache if already cached - resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, - proxies=proxies, resume_download=resume_download) + resolved_config_file = cached_path( + config_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + ) # Load config config = cls.from_json_file(resolved_config_file) except EnvironmentError: if pretrained_model_name_or_path in cls.pretrained_config_archive_map: msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format( - config_file) + config_file + ) else: - msg = "Model name '{}' was not found in model name list ({}). " \ - "We assumed '{}' was a path or url to a configuration file named {} or " \ - "a directory containing such a file but couldn't find any such file at this path or url.".format( + msg = ( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url to a configuration file named {} or " + "a directory containing such a file but couldn't find any such file at this path or url.".format( pretrained_model_name_or_path, - ', '.join(cls.pretrained_config_archive_map.keys()), - config_file, CONFIG_NAME) + ", ".join(cls.pretrained_config_archive_map.keys()), + config_file, + CONFIG_NAME, + ) + ) raise EnvironmentError(msg) except json.JSONDecodeError: - msg = "Couldn't reach server at '{}' to download configuration file or " \ - "configuration file is not a valid JSON file. " \ - "Please check network or file content here: {}.".format(config_file, resolved_config_file) + msg = ( + "Couldn't reach server at '{}' to download configuration file or " + "configuration file is not a valid JSON file. 
" + "Please check network or file content here: {}.".format(config_file, resolved_config_file) + ) raise EnvironmentError(msg) if resolved_config_file == config_file: logger.info("loading configuration file {}".format(config_file)) else: - logger.info("loading configuration file {} from cache at {}".format( - config_file, resolved_config_file)) + logger.info("loading configuration file {} from cache at {}".format(config_file, resolved_config_file)) - if hasattr(config, 'pruned_heads'): + if hasattr(config, "pruned_heads"): config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) # Update config with kwargs if needed @@ -226,7 +239,7 @@ class PretrainedConfig(object): @classmethod def from_json_file(cls, json_file): """Constructs a `Config` from a json file of parameters.""" - with open(json_file, "r", encoding='utf-8') as reader: + with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() dict_obj = json.loads(text) return cls(**dict_obj) @@ -248,5 +261,5 @@ class PretrainedConfig(object): def to_json_file(self, json_file_path): """ Save this instance to a json file.""" - with open(json_file_path, "w", encoding='utf-8') as writer: + with open(json_file_path, "w", encoding="utf-8") as writer: writer.write(self.to_json_string()) diff --git a/transformers/configuration_xlm.py b/transformers/configuration_xlm.py index 727f31977..a98024e9e 100644 --- a/transformers/configuration_xlm.py +++ b/transformers/configuration_xlm.py @@ -25,16 +25,16 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", - 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json", - 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json", - 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json", - 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json", - 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json", - 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json", - 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json", - 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json", + "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", + "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json", + "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json", + "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json", + "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json", + "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json", + "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json", + "xlm-clm-ende-1024": 
"https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json", + "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", + "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json", } @@ -78,41 +78,44 @@ class XLMConfig(PretrainedConfig): -1 means no clamping. same_length: bool, whether to use the same attention length for each token. """ + pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=30145, - emb_dim=2048, - n_layers=12, - n_heads=16, - dropout=0.1, - attention_dropout=0.1, - gelu_activation=True, - sinusoidal_embeddings=False, - causal=False, - asm=False, - n_langs=1, - use_lang_emb=True, - max_position_embeddings=512, - embed_init_std=2048 ** -0.5, - layer_norm_eps=1e-12, - init_std=0.02, - bos_index=0, - eos_index=1, - pad_index=2, - unk_index=3, - mask_index=5, - is_encoder=True, - summary_type='first', - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - start_n_top=5, - end_n_top=5, - mask_token_id=0, - lang_id=0, - **kwargs): + def __init__( + self, + vocab_size=30145, + emb_dim=2048, + n_layers=12, + n_heads=16, + dropout=0.1, + attention_dropout=0.1, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=1, + use_lang_emb=True, + max_position_embeddings=512, + embed_init_std=2048 ** -0.5, + layer_norm_eps=1e-12, + init_std=0.02, + bos_index=0, + eos_index=1, + pad_index=2, + unk_index=3, + mask_index=5, + is_encoder=True, + summary_type="first", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + start_n_top=5, + end_n_top=5, + mask_token_id=0, + lang_id=0, + **kwargs + ): """Constructs XLMConfig. """ super(XLMConfig, self).__init__(**kwargs) diff --git a/transformers/configuration_xlm_roberta.py b/transformers/configuration_xlm_roberta.py index 5b6955f4f..fcf5c571d 100644 --- a/transformers/configuration_xlm_roberta.py +++ b/transformers/configuration_xlm_roberta.py @@ -15,8 +15,7 @@ # limitations under the License. 
""" XLM-RoBERTa configuration """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging @@ -25,12 +24,12 @@ from .configuration_roberta import RobertaConfig logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", - 'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", - 'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", - 'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", - 'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", - 'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", + "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", + "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", + "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", + "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", + "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", + "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", } diff --git a/transformers/configuration_xlnet.py b/transformers/configuration_xlnet.py index 017c57cfd..8768aeac9 100644 --- a/transformers/configuration_xlnet.py +++ b/transformers/configuration_xlnet.py @@ -26,8 +26,8 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", - 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", + "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", + "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", } @@ -69,32 +69,35 @@ class XLNetConfig(PretrainedConfig): same_length: bool, whether to use the same attention length for each token. 
finetuning_task: name of the glue task on which the model was fine-tuned if any """ + pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=32000, - d_model=1024, - n_layer=24, - n_head=16, - d_inner=4096, - ff_activation="gelu", - untie_r=True, - attn_type="bi", - initializer_range=0.02, - layer_norm_eps=1e-12, - dropout=0.1, - mem_len=None, - reuse_len=None, - bi_data=False, - clamp_len=-1, - same_length=False, - summary_type='last', - summary_use_proj=True, - summary_activation='tanh', - summary_last_dropout=0.1, - start_n_top=5, - end_n_top=5, - **kwargs): + def __init__( + self, + vocab_size=32000, + d_model=1024, + n_layer=24, + n_head=16, + d_inner=4096, + ff_activation="gelu", + untie_r=True, + attn_type="bi", + initializer_range=0.02, + layer_norm_eps=1e-12, + dropout=0.1, + mem_len=None, + reuse_len=None, + bi_data=False, + clamp_len=-1, + same_length=False, + summary_type="last", + summary_use_proj=True, + summary_activation="tanh", + summary_last_dropout=0.1, + start_n_top=5, + end_n_top=5, + **kwargs + ): """Constructs XLNetConfig. """ super(XLNetConfig, self).__init__(**kwargs) diff --git a/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py b/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py index b6476b4fb..733f6fc5c 100644 --- a/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py @@ -24,6 +24,7 @@ import torch from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert import logging + logging.basicConfig(level=logging.INFO) @@ -44,24 +45,19 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pyt if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--tf_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--albert_config_file", - default = None, - type = str, - required = True, - help = "The config json file corresponding to the pre-trained ALBERT model. \n" - "This specifies the model architecture.") - parser.add_argument("--pytorch_dump_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--albert_config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the pre-trained ALBERT model. \n" + "This specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
+ ) args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.albert_config_file, - args.pytorch_dump_path) - \ No newline at end of file + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) diff --git a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py index 75808811e..9393068b1 100755 --- a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py @@ -24,8 +24,10 @@ import torch from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert import logging + logging.basicConfig(level=logging.INFO) + def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): # Initialise PyTorch model config = BertConfig.from_json_file(bert_config_file) @@ -43,23 +45,19 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--tf_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--bert_config_file", - default = None, - type = str, - required = True, - help = "The config json file corresponding to the pre-trained BERT model. \n" - "This specifies the model architecture.") - parser.add_argument("--pytorch_dump_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--bert_config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the pre-trained BERT model. \n" + "This specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
+ ) args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.bert_config_file, - args.pytorch_dump_path) + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py index 35866caac..304c63450 100644 --- a/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py +++ b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py @@ -23,7 +23,7 @@ import tensorflow as tf from transformers import BertModel -def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str): +def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): """ :param model:BertModel Pytorch model instance to be converted @@ -41,22 +41,17 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s N BertForQuestionAnswering """ - tensors_to_transpose = ( - "dense.weight", - "attention.self.query", - "attention.self.key", - "attention.self.value" - ) + tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") var_map = ( - ('layer.', 'layer_'), - ('word_embeddings.weight', 'word_embeddings'), - ('position_embeddings.weight', 'position_embeddings'), - ('token_type_embeddings.weight', 'token_type_embeddings'), - ('.', '/'), - ('LayerNorm/weight', 'LayerNorm/gamma'), - ('LayerNorm/bias', 'LayerNorm/beta'), - ('weight', 'kernel') + ("layer.", "layer_"), + ("word_embeddings.weight", "word_embeddings"), + ("position_embeddings.weight", "position_embeddings"), + ("token_type_embeddings.weight", "token_type_embeddings"), + (".", "/"), + ("LayerNorm/weight", "LayerNorm/gamma"), + ("LayerNorm/bias", "LayerNorm/beta"), + ("weight", "kernel"), ) if not os.path.isdir(ckpt_dir): @@ -64,12 +59,12 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s state_dict = model.state_dict() - def to_tf_var_name(name:str): + def to_tf_var_name(name: str): for patt, repl in iter(var_map): name = name.replace(patt, repl) - return 'bert/{}'.format(name) + return "bert/{}".format(name) - def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session): + def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): tf_dtype = tf.dtypes.as_dtype(tensor.dtype) tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) session.run(tf.variables_initializer([tf_var])) @@ -94,37 +89,22 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s def main(raw_args=None): parser = argparse.ArgumentParser() - parser.add_argument("--model_name", - type=str, - required=True, - help="model name e.g. bert-base-uncased") - parser.add_argument("--cache_dir", - type=str, - default=None, - required=False, - help="Directory containing pytorch model") - parser.add_argument("--pytorch_model_path", - type=str, - required=True, - help="/path/to/.bin") - parser.add_argument("--tf_cache_dir", - type=str, - required=True, - help="Directory in which to save tensorflow model") + parser.add_argument("--model_name", type=str, required=True, help="model name e.g. 
bert-base-uncased") + parser.add_argument( + "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" + ) + parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") + parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") args = parser.parse_args(raw_args) - + model = BertModel.from_pretrained( pretrained_model_name_or_path=args.model_name, state_dict=torch.load(args.pytorch_model_path), - cache_dir=args.cache_dir - ) - - convert_pytorch_checkpoint_to_tf( - model=model, - ckpt_dir=args.tf_cache_dir, - model_name=args.model_name + cache_dir=args.cache_dir, ) + convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) + if __name__ == "__main__": main() diff --git a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py index e2328c08c..eeafdb81e 100755 --- a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py @@ -21,12 +21,10 @@ from io import open import torch -from transformers import (CONFIG_NAME, WEIGHTS_NAME, - GPT2Config, - GPT2Model, - load_tf_weights_in_gpt2) +from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2 import logging + logging.basicConfig(level=logging.INFO) @@ -42,8 +40,8 @@ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, p load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) # Save pytorch-model - pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME + pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME + pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) torch.save(model.state_dict(), pytorch_weights_dump_path) print("Save configuration file to {}".format(pytorch_config_dump_path)) @@ -54,22 +52,18 @@ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, p if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--gpt2_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--pytorch_dump_folder_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") - parser.add_argument("--gpt2_config_file", - default = "", - type = str, - help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" - "This specifies the model architecture.") + parser.add_argument( + "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--gpt2_config_file", + default="", + type=str, + help="An optional config json file corresponding to the pre-trained OpenAI model. 
\n" + "This specifies the model architecture.", + ) args = parser.parse_args() - convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, - args.gpt2_config_file, - args.pytorch_dump_folder_path) + convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) diff --git a/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py index 13ebecf2f..c87bb9d59 100755 --- a/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py @@ -21,12 +21,10 @@ from io import open import torch -from transformers import (CONFIG_NAME, WEIGHTS_NAME, - OpenAIGPTConfig, - OpenAIGPTModel, - load_tf_weights_in_openai_gpt) +from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt import logging + logging.basicConfig(level=logging.INFO) @@ -42,8 +40,8 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) # Save pytorch-model - pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME + pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME + pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) torch.save(model.state_dict(), pytorch_weights_dump_path) print("Save configuration file to {}".format(pytorch_config_dump_path)) @@ -54,22 +52,24 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--openai_checkpoint_folder_path", - default = None, - type = str, - required = True, - help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--pytorch_dump_folder_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") - parser.add_argument("--openai_config_file", - default = "", - type = str, - help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" - "This specifies the model architecture.") + parser.add_argument( + "--openai_checkpoint_folder_path", + default=None, + type=str, + required=True, + help="Path to the TensorFlow checkpoint path.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--openai_config_file", + default="", + type=str, + help="An optional config json file corresponding to the pre-trained OpenAI model. 
\n" + "This specifies the model architecture.", + ) args = parser.parse_args() - convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, - args.openai_config_file, - args.pytorch_dump_folder_path) + convert_openai_checkpoint_to_pytorch( + args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path + ) diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py index 0edac6fb7..c7ad66e13 100644 --- a/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -24,82 +24,270 @@ import tensorflow as tf from transformers import is_torch_available, cached_path -from transformers import (load_pytorch_checkpoint_in_tf2_model, - BertConfig, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - GPT2Config, TFGPT2LMHeadModel, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, - XLNetConfig, TFXLNetLMHeadModel, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, - XLMConfig, TFXLMWithLMHeadModel, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, - TransfoXLConfig, TFTransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, - OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, - RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, - DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, TFDistilBertForSequenceClassification, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, - AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP) +from transformers import ( + load_pytorch_checkpoint_in_tf2_model, + BertConfig, + TFBertForPreTraining, + TFBertForQuestionAnswering, + TFBertForSequenceClassification, + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + GPT2Config, + TFGPT2LMHeadModel, + GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, + XLNetConfig, + TFXLNetLMHeadModel, + XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, + XLMConfig, + TFXLMWithLMHeadModel, + XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, + TransfoXLConfig, + TFTransfoXLLMHeadModel, + TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, + OpenAIGPTConfig, + TFOpenAIGPTLMHeadModel, + OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, + RobertaConfig, + TFRobertaForMaskedLM, + TFRobertaForSequenceClassification, + ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, + DistilBertConfig, + TFDistilBertForMaskedLM, + TFDistilBertForQuestionAnswering, + TFDistilBertForSequenceClassification, + DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + CTRLConfig, + TFCTRLLMHeadModel, + CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, + AlbertConfig, + TFAlbertForMaskedLM, + ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + T5Config, + TFT5WithLMHeadModel, + T5_PRETRAINED_CONFIG_ARCHIVE_MAP, +) if is_torch_available(): import torch import numpy as np - from transformers import (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, - XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, - XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, - TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, - OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, - RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, - DistilBertForMaskedLM, DistilBertForQuestionAnswering, 
DistilBertForSequenceClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, - AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers import ( + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, + GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + XLNetLMHeadModel, + XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + XLMWithLMHeadModel, + XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + TransfoXLLMHeadModel, + TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + OpenAIGPTLMHeadModel, + OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + RobertaForMaskedLM, + RobertaForSequenceClassification, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + DistilBertForMaskedLM, + DistilBertForQuestionAnswering, + DistilBertForSequenceClassification, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + CTRLLMHeadModel, + CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, + AlbertForMaskedLM, + ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + T5WithLMHeadModel, + T5_PRETRAINED_MODEL_ARCHIVE_MAP, + ) else: - (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, - XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, - XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, - TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, - OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, - RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, - DistilBertForMaskedLM, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, - AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = ( - None, None, None, None, - None, None, - None, None, - None, None, - None, None, - None, None, - None, None, None, - None, None, None, None, - None, None, - None, None, - None, None) + ( + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, + GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + XLNetLMHeadModel, + XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + XLMWithLMHeadModel, + XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + TransfoXLLMHeadModel, + TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + OpenAIGPTLMHeadModel, + OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + RobertaForMaskedLM, + RobertaForSequenceClassification, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + DistilBertForMaskedLM, + DistilBertForSequenceClassification, + DistilBertForQuestionAnswering, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + CTRLLMHeadModel, + CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, + AlbertForMaskedLM, + ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + T5WithLMHeadModel, + T5_PRETRAINED_MODEL_ARCHIVE_MAP, + ) = ( + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) import logging + logging.basicConfig(level=logging.INFO) MODEL_CLASSES = { - 'bert': (BertConfig, TFBertForPreTraining, BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'bert-large-uncased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, BertForQuestionAnswering, 
BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'bert-large-cased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'bert-base-cased-finetuned-mrpc': (BertConfig, TFBertForSequenceClassification, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'gpt2': (GPT2Config, TFGPT2LMHeadModel, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'xlnet': (XLNetConfig, TFXLNetLMHeadModel, XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'xlm': (XLMConfig, TFXLMWithLMHeadModel, XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'transfo-xl': (TransfoXLConfig, TFTransfoXLLMHeadModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'openai-gpt': (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'roberta': (RobertaConfig, TFRobertaForMaskedLM, RobertaForMaskedLM, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP), + "bert": ( + BertConfig, + TFBertForPreTraining, + BertForPreTraining, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "bert-large-uncased-whole-word-masking-finetuned-squad": ( + BertConfig, + TFBertForQuestionAnswering, + BertForQuestionAnswering, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "bert-large-cased-whole-word-masking-finetuned-squad": ( + BertConfig, + TFBertForQuestionAnswering, + BertForQuestionAnswering, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "bert-base-cased-finetuned-mrpc": ( + BertConfig, + TFBertForSequenceClassification, + BertForSequenceClassification, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "gpt2": ( + GPT2Config, + TFGPT2LMHeadModel, + GPT2LMHeadModel, + GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "xlnet": ( + XLNetConfig, + TFXLNetLMHeadModel, + XLNetLMHeadModel, + XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + 
XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "xlm": ( + XLMConfig, + TFXLMWithLMHeadModel, + XLMWithLMHeadModel, + XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "transfo-xl": ( + TransfoXLConfig, + TFTransfoXLLMHeadModel, + TransfoXLLMHeadModel, + TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "openai-gpt": ( + OpenAIGPTConfig, + TFOpenAIGPTLMHeadModel, + OpenAIGPTLMHeadModel, + OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "roberta": ( + RobertaConfig, + TFRobertaForMaskedLM, + RobertaForMaskedLM, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "roberta-large-mnli": ( + RobertaConfig, + TFRobertaForSequenceClassification, + RobertaForSequenceClassification, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "distilbert": ( + DistilBertConfig, + TFDistilBertForMaskedLM, + DistilBertForMaskedLM, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "distilbert-base-uncased-distilled-squad": ( + DistilBertConfig, + TFDistilBertForQuestionAnswering, + DistilBertForQuestionAnswering, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "distilbert-base-uncased-distilled-squad": ( + DistilBertConfig, + TFDistilBertForQuestionAnswering, + DistilBertForQuestionAnswering, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "ctrl": ( + CTRLConfig, + TFCTRLLMHeadModel, + CTRLLMHeadModel, + CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, + CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "albert": ( + AlbertConfig, + TFAlbertForMaskedLM, + AlbertForMaskedLM, + ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "t5": ( + T5Config, + TFT5WithLMHeadModel, + T5WithLMHeadModel, + T5_PRETRAINED_MODEL_ARCHIVE_MAP, + T5_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), } -def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True): + +def convert_pt_checkpoint_to_tf( + model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True +): if model_type not in MODEL_CLASSES: raise ValueError("Unrecognized model type, should be one of {}.".format(list(MODEL_CLASSES.keys()))) @@ -116,17 +304,19 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file # Load weights from tf checkpoint if pytorch_checkpoint_path in aws_model_maps: - pytorch_checkpoint_path = cached_path(aws_model_maps[pytorch_checkpoint_path], force_download=not use_cached_models) + pytorch_checkpoint_path = cached_path( + aws_model_maps[pytorch_checkpoint_path], force_download=not use_cached_models + ) # Load PyTorch checkpoint in tf2 model: tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path) if compare_with_pt_model: tfo = tf_model(tf_model.dummy_inputs, training=False) # build the network - state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu') - pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None, - config=config, - state_dict=state_dict) + state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu") + pt_model = pt_model_class.from_pretrained( + pretrained_model_name_or_path=None, config=config, state_dict=state_dict + ) with torch.no_grad(): pto = pt_model(**pt_model.dummy_inputs) @@ 
-139,11 +329,19 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file # Save pytorch-model print("Save TensorFlow model to {}".format(tf_dump_path)) - tf_model.save_weights(tf_dump_path, save_format='h5') + tf_model.save_weights(tf_dump_path, save_format="h5") -def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None, - compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False): +def convert_all_pt_checkpoints_to_tf( + args_model_type, + tf_dump_path, + model_shortcut_names_or_path=None, + config_shortcut_names_or_path=None, + compare_with_pt_model=False, + use_cached_models=False, + remove_cached_files=False, + only_convert_finetuned_models=False, +): assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory" if args_model_type is None: @@ -156,7 +354,9 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc print(" Converting model type {}/{}: {}".format(j, len(model_types), model_type)) print("=" * 100) if model_type not in MODEL_CLASSES: - raise ValueError("Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys()))) + raise ValueError( + "Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys())) + ) config_class, model_class, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type] @@ -166,9 +366,10 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc config_shortcut_names_or_path = model_shortcut_names_or_path for i, (model_shortcut_name, config_shortcut_name) in enumerate( - zip(model_shortcut_names_or_path, config_shortcut_names_or_path), start=1): + zip(model_shortcut_names_or_path, config_shortcut_names_or_path), start=1 + ): print("-" * 100) - if '-squad' in model_shortcut_name or '-mrpc' in model_shortcut_name or '-mnli' in model_shortcut_name: + if "-squad" in model_shortcut_name or "-mrpc" in model_shortcut_name or "-mnli" in model_shortcut_name: if not only_convert_finetuned_models: print(" Skipping finetuned checkpoint {}".format(model_shortcut_name)) continue @@ -176,7 +377,11 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc elif only_convert_finetuned_models: print(" Skipping not finetuned checkpoint {}".format(model_shortcut_name)) continue - print(" Converting checkpoint {}/{}: {} - model_type {}".format(i, len(aws_config_map), model_shortcut_name, model_type)) + print( + " Converting checkpoint {}/{}: {} - model_type {}".format( + i, len(aws_config_map), model_shortcut_name, model_type + ) + ) print("-" * 100) if config_shortcut_name in aws_config_map: @@ -190,13 +395,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc model_file = cached_path(model_shortcut_name, force_download=not use_cached_models) if os.path.isfile(model_shortcut_name): - model_shortcut_name = 'converted_model' + model_shortcut_name = "converted_model" - convert_pt_checkpoint_to_tf(model_type=model_type, - pytorch_checkpoint_path=model_file, - config_file=config_file, - tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'), - compare_with_pt_model=compare_with_pt_model) + convert_pt_checkpoint_to_tf( + model_type=model_type, + pytorch_checkpoint_path=model_file, + config_file=config_file, + tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + "-tf_model.h5"), 
+ compare_with_pt_model=compare_with_pt_model, + ) if remove_cached_files: os.remove(config_file) os.remove(model_file) @@ -205,39 +412,47 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--tf_dump_path", - default = None, - type = str, - required = True, - help = "Path to the output Tensorflow dump file.") - parser.add_argument("--model_type", - default = None, - type = str, - help = "Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format(list(MODEL_CLASSES.keys()))) - parser.add_argument("--pytorch_checkpoint_path", - default = None, - type = str, - help = "Path to the PyTorch checkpoint path or shortcut name to download from AWS. " - "If not given, will download and convert all the checkpoints from AWS.") - parser.add_argument("--config_file", - default = None, - type = str, - help = "The config json file corresponding to the pre-trained model. \n" - "This specifies the model architecture. If not given and " - "--pytorch_checkpoint_path is not given or is a shortcut name" - "use the configuration associated to the shortcut name on the AWS") - parser.add_argument("--compare_with_pt_model", - action='store_true', - help = "Compare Tensorflow and PyTorch model predictions.") - parser.add_argument("--use_cached_models", - action='store_true', - help = "Use cached models if possible instead of updating to latest checkpoint versions.") - parser.add_argument("--remove_cached_files", - action='store_true', - help = "Remove pytorch models after conversion (save memory when converting in batches).") - parser.add_argument("--only_convert_finetuned_models", - action='store_true', - help = "Only convert finetuned models.") + parser.add_argument( + "--tf_dump_path", default=None, type=str, required=True, help="Path to the output Tensorflow dump file." + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + help="Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format( + list(MODEL_CLASSES.keys()) + ), + ) + parser.add_argument( + "--pytorch_checkpoint_path", + default=None, + type=str, + help="Path to the PyTorch checkpoint path or shortcut name to download from AWS. " + "If not given, will download and convert all the checkpoints from AWS.", + ) + parser.add_argument( + "--config_file", + default=None, + type=str, + help="The config json file corresponding to the pre-trained model. \n" + "This specifies the model architecture. If not given and " + "--pytorch_checkpoint_path is not given or is a shortcut name" + "use the configuration associated to the shortcut name on the AWS", + ) + parser.add_argument( + "--compare_with_pt_model", action="store_true", help="Compare Tensorflow and PyTorch model predictions." 
+ ) + parser.add_argument( + "--use_cached_models", + action="store_true", + help="Use cached models if possible instead of updating to latest checkpoint versions.", + ) + parser.add_argument( + "--remove_cached_files", + action="store_true", + help="Remove pytorch models after conversion (save memory when converting in batches).", + ) + parser.add_argument("--only_convert_finetuned_models", action="store_true", help="Only convert finetuned models.") args = parser.parse_args() # if args.pytorch_checkpoint_path is not None: @@ -248,11 +463,15 @@ if __name__ == "__main__": # compare_with_pt_model=args.compare_with_pt_model, # use_cached_models=args.use_cached_models) # else: - convert_all_pt_checkpoints_to_tf(args.model_type.lower() if args.model_type is not None else None, - args.tf_dump_path, - model_shortcut_names_or_path=[args.pytorch_checkpoint_path] if args.pytorch_checkpoint_path is not None else None, - config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None, - compare_with_pt_model=args.compare_with_pt_model, - use_cached_models=args.use_cached_models, - remove_cached_files=args.remove_cached_files, - only_convert_finetuned_models=args.only_convert_finetuned_models) + convert_all_pt_checkpoints_to_tf( + args.model_type.lower() if args.model_type is not None else None, + args.tf_dump_path, + model_shortcut_names_or_path=[args.pytorch_checkpoint_path] + if args.pytorch_checkpoint_path is not None + else None, + config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None, + compare_with_pt_model=args.compare_with_pt_model, + use_cached_models=args.use_cached_models, + remove_cached_files=args.remove_cached_files, + only_convert_finetuned_models=args.only_convert_finetuned_models, + ) diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py index fedfc1ecb..3dec4882f 100644 --- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -30,20 +30,27 @@ if version.parse(fairseq.__version__) < version.parse("0.9.0"): from fairseq.models.roberta import RobertaModel as FairseqRobertaModel from fairseq.modules import TransformerSentenceEncoderLayer -from transformers.modeling_bert import (BertConfig, BertEncoder, - BertIntermediate, BertLayer, - BertModel, BertOutput, - BertSelfAttention, - BertSelfOutput) -from transformers.modeling_roberta import (RobertaEmbeddings, - RobertaForMaskedLM, - RobertaForSequenceClassification, - RobertaModel) +from transformers.modeling_bert import ( + BertConfig, + BertEncoder, + BertIntermediate, + BertLayer, + BertModel, + BertOutput, + BertSelfAttention, + BertSelfOutput, +) +from transformers.modeling_roberta import ( + RobertaEmbeddings, + RobertaForMaskedLM, + RobertaForSequenceClassification, + RobertaModel, +) logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -SAMPLE_TEXT = 'Hello world! cécé herlolip' +SAMPLE_TEXT = "Hello world! 
cécé herlolip" def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head): @@ -61,7 +68,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ intermediate_size=roberta.args.encoder_ffn_embed_dim, max_position_embeddings=514, type_vocab_size=1, - layer_norm_eps=1e-5, # PyTorch default used in fairseq + layer_norm_eps=1e-5, # PyTorch default used in fairseq ) if classification_head: config.num_labels = roberta.args.num_classes @@ -74,7 +81,9 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ # Embeddings model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight - model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them. + model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like( + model.roberta.embeddings.token_type_embeddings.weight + ) # just zero them out b/c RoBERTa doesn't use them. model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias @@ -85,11 +94,11 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ### self attention self_attn: BertSelfAttention = layer.attention.self - assert( - roberta_layer.self_attn.k_proj.weight.data.shape == \ - roberta_layer.self_attn.q_proj.weight.data.shape == \ - roberta_layer.self_attn.v_proj.weight.data.shape == \ - torch.Size((config.hidden_size, config.hidden_size)) + assert ( + roberta_layer.self_attn.k_proj.weight.data.shape + == roberta_layer.self_attn.q_proj.weight.data.shape + == roberta_layer.self_attn.v_proj.weight.data.shape + == torch.Size((config.hidden_size, config.hidden_size)) ) self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight @@ -101,9 +110,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ### self-attention output self_output: BertSelfOutput = layer.attention.output - assert( - self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape - ) + assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape self_output.dense.weight = roberta_layer.self_attn.out_proj.weight self_output.dense.bias = roberta_layer.self_attn.out_proj.bias self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight @@ -111,28 +118,24 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ### intermediate intermediate: BertIntermediate = layer.intermediate - assert( - intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape - ) + assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape intermediate.dense.weight = roberta_layer.fc1.weight intermediate.dense.bias = roberta_layer.fc1.bias ### output bert_output: BertOutput = layer.output - assert( - bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape - ) + assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape bert_output.dense.weight = roberta_layer.fc2.weight bert_output.dense.bias = roberta_layer.fc2.bias bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias #### end of layer - + if classification_head: - 
model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight - model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias - model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight - model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias + model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight + model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias + model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight + model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias else: # LM Head model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight @@ -143,21 +146,18 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ model.lm_head.bias = roberta.model.decoder.lm_head.bias # Let's check that we get the same results. - input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 + input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 our_output = model(input_ids)[0] if classification_head: - their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids)) + their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids)) else: their_output = roberta.model(input_ids)[0] print(our_output.shape, their_output.shape) max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 + print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 success = torch.allclose(our_output, their_output, atol=1e-3) - print( - "Do both models output the same tensors?", - "🔥" if success else "💩" - ) + print("Do both models output the same tensors?", "🔥" if success else "💩") if not success: raise Exception("Something went wRoNg") @@ -169,23 +169,16 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--roberta_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path the official PyTorch dump.") - parser.add_argument("--pytorch_dump_folder_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") - parser.add_argument("--classification_head", - action = "store_true", - help = "Whether to convert a final classification head.") + parser.add_argument( + "--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--classification_head", action="store_true", help="Whether to convert a final classification head." 
+ ) args = parser.parse_args() convert_roberta_checkpoint_to_pytorch( - args.roberta_checkpoint_path, - args.pytorch_dump_folder_path, - args.classification_head + args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head ) - diff --git a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py index 2b74d2dd9..0b22a5f9c 100755 --- a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py @@ -24,8 +24,10 @@ import torch from transformers import T5Config, T5Model, load_tf_weights_in_t5 import logging + logging.basicConfig(level=logging.INFO) + def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): # Initialise PyTorch model config = T5Config.from_json_file(config_file) @@ -43,23 +45,19 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--tf_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--config_file", - default = None, - type = str, - required = True, - help = "The config json file corresponding to the pre-trained T5 model. \n" - "This specifies the model architecture.") - parser.add_argument("--pytorch_dump_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the pre-trained T5 model. \n" + "This specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.config_file, - args.pytorch_dump_path) + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py b/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py index a5ff4ed22..f8dd45ae5 100755 --- a/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py @@ -26,9 +26,8 @@ import torch import transformers.tokenization_transfo_xl as data_utils from transformers import CONFIG_NAME, WEIGHTS_NAME -from transformers import (TransfoXLConfig, TransfoXLLMHeadModel, - load_tf_weights_in_transfo_xl) -from transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES) +from transformers import TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl +from transformers.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES if sys.version_info[0] == 2: import cPickle as pickle @@ -36,32 +35,33 @@ else: import pickle import logging + logging.basicConfig(level=logging.INFO) # We do this to be able to load python 2 datasets pickles # See e.g. 
https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 data_utils.Vocab = data_utils.TransfoXLTokenizer data_utils.Corpus = data_utils.TransfoXLCorpus -sys.modules['data_utils'] = data_utils -sys.modules['vocabulary'] = data_utils +sys.modules["data_utils"] = data_utils +sys.modules["vocabulary"] = data_utils + -def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, - transfo_xl_config_file, - pytorch_dump_folder_path, - transfo_xl_dataset_file): +def convert_transfo_xl_checkpoint_to_pytorch( + tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file +): if transfo_xl_dataset_file: # Convert a pre-processed corpus (see original TensorFlow repo) with open(transfo_xl_dataset_file, "rb") as fp: corpus = pickle.load(fp, encoding="latin1") # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) - pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['pretrained_vocab_file'] + pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"] print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) corpus_vocab_dict = corpus.vocab.__dict__ torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) corpus_dict_no_vocab = corpus.__dict__ - corpus_dict_no_vocab.pop('vocab', None) - pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME + corpus_dict_no_vocab.pop("vocab", None) + pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME print("Save dataset to {}".format(pytorch_dataset_dump_path)) torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) @@ -92,26 +92,36 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", - default = None, - type = str, - required = True, - help = "Path to the folder to store the PyTorch model or dataset/vocab.") - parser.add_argument("--tf_checkpoint_path", - default = "", - type = str, - help = "An optional path to a TensorFlow checkpoint path to be converted.") - parser.add_argument("--transfo_xl_config_file", - default = "", - type = str, - help = "An optional config json file corresponding to the pre-trained BERT model. \n" - "This specifies the model architecture.") - parser.add_argument("--transfo_xl_dataset_file", - default = "", - type = str, - help = "An optional dataset file to be converted in a vocabulary.") + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + required=True, + help="Path to the folder to store the PyTorch model or dataset/vocab.", + ) + parser.add_argument( + "--tf_checkpoint_path", + default="", + type=str, + help="An optional path to a TensorFlow checkpoint path to be converted.", + ) + parser.add_argument( + "--transfo_xl_config_file", + default="", + type=str, + help="An optional config json file corresponding to the pre-trained BERT model. 
\n" + "This specifies the model architecture.", + ) + parser.add_argument( + "--transfo_xl_dataset_file", + default="", + type=str, + help="An optional dataset file to be converted in a vocabulary.", + ) args = parser.parse_args() - convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.transfo_xl_config_file, - args.pytorch_dump_folder_path, - args.transfo_xl_dataset_file) + convert_transfo_xl_checkpoint_to_pytorch( + args.tf_checkpoint_path, + args.transfo_xl_config_file, + args.pytorch_dump_folder_path, + args.transfo_xl_dataset_file, + ) diff --git a/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py index 91133ef56..7cbf9cae9 100755 --- a/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py @@ -27,32 +27,34 @@ from transformers import CONFIG_NAME, WEIGHTS_NAME from transformers.tokenization_xlm import VOCAB_FILES_NAMES import logging + logging.basicConfig(level=logging.INFO) + def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): # Load checkpoint - chkpt = torch.load(xlm_checkpoint_path, map_location='cpu') + chkpt = torch.load(xlm_checkpoint_path, map_location="cpu") - state_dict = chkpt['model'] + state_dict = chkpt["model"] # We have the base model one level deeper than the original XLM repository two_levels_state_dict = {} for k, v in state_dict.items(): - if 'pred_layer' in k: + if "pred_layer" in k: two_levels_state_dict[k] = v else: - two_levels_state_dict['transformer.' + k] = v + two_levels_state_dict["transformer." + k] = v - config = chkpt['params'] + config = chkpt["params"] config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) - vocab = chkpt['dico_word2id'] - vocab = dict((s + '' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items()) + vocab = chkpt["dico_word2id"] + vocab = dict((s + "" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items()) # Save pytorch-model - pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME - pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file'] + pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME + pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME + pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"] print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) torch.save(two_levels_state_dict, pytorch_weights_dump_path) @@ -69,15 +71,11 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--xlm_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path the official PyTorch dump.") - parser.add_argument("--pytorch_dump_folder_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") + parser.add_argument( + "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
+ ) args = parser.parse_args() convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py index 3669d9944..83688cf07 100755 --- a/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py @@ -22,11 +22,15 @@ import os import argparse import torch -from transformers import (CONFIG_NAME, WEIGHTS_NAME, - XLNetConfig, - XLNetLMHeadModel, XLNetForQuestionAnswering, - XLNetForSequenceClassification, - load_tf_weights_in_xlnet) +from transformers import ( + CONFIG_NAME, + WEIGHTS_NAME, + XLNetConfig, + XLNetLMHeadModel, + XLNetForQuestionAnswering, + XLNetForSequenceClassification, + load_tf_weights_in_xlnet, +) GLUE_TASKS_NUM_LABELS = { "cola": 2, @@ -41,9 +45,13 @@ GLUE_TASKS_NUM_LABELS = { } import logging + logging.basicConfig(level=logging.INFO) -def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None): + +def convert_xlnet_checkpoint_to_pytorch( + tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None +): # Initialise PyTorch model config = XLNetConfig.from_json_file(bert_config_file) @@ -53,7 +61,7 @@ def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, py config.finetuning_task = finetuning_task config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] model = XLNetForSequenceClassification(config) - elif 'squad' in finetuning_task: + elif "squad" in finetuning_task: config.finetuning_task = finetuning_task model = XLNetForQuestionAnswering(config) else: @@ -75,30 +83,33 @@ def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, py if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--tf_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--xlnet_config_file", - default = None, - type = str, - required = True, - help = "The config json file corresponding to the pre-trained XLNet model. \n" - "This specifies the model architecture.") - parser.add_argument("--pytorch_dump_folder_path", - default = None, - type = str, - required = True, - help = "Path to the folder to store the PyTorch model or dataset/vocab.") - parser.add_argument("--finetuning_task", - default = None, - type = str, - help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned") + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--xlnet_config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the pre-trained XLNet model. 
\n" + "This specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + required=True, + help="Path to the folder to store the PyTorch model or dataset/vocab.", + ) + parser.add_argument( + "--finetuning_task", + default=None, + type=str, + help="Name of a task on which the XLNet TensorFloaw model was fine-tuned", + ) args = parser.parse_args() print(args) - convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.xlnet_config_file, - args.pytorch_dump_folder_path, - args.finetuning_task) + convert_xlnet_checkpoint_to_pytorch( + args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task + ) diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py index 5567952fd..bac6c6e3a 100644 --- a/transformers/data/__init__.py +++ b/transformers/data/__init__.py @@ -1,8 +1,15 @@ -from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures, SingleSentenceClassificationProcessor +from .processors import ( + InputExample, + InputFeatures, + DataProcessor, + SquadFeatures, + SingleSentenceClassificationProcessor, +) from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels from .metrics import is_sklearn_available + if is_sklearn_available(): from .metrics import glue_compute_metrics, xnli_compute_metrics diff --git a/transformers/data/metrics/__init__.py b/transformers/data/metrics/__init__.py index 5a46eb05d..bd3b76efc 100644 --- a/transformers/data/metrics/__init__.py +++ b/transformers/data/metrics/__init__.py @@ -23,20 +23,22 @@ logger = logging.getLogger(__name__) try: from scipy.stats import pearsonr, spearmanr from sklearn.metrics import matthews_corrcoef, f1_score + _has_sklearn = True except (AttributeError, ImportError) as e: logger.warning("To use data.metrics please install scikit-learn. 
See https://scikit-learn.org/stable/index.html") _has_sklearn = False + def is_sklearn_available(): return _has_sklearn + if _has_sklearn: def simple_accuracy(preds, labels): return (preds == labels).mean() - def acc_and_f1(preds, labels): acc = simple_accuracy(preds, labels) f1 = f1_score(y_true=labels, y_pred=preds) @@ -46,7 +48,6 @@ if _has_sklearn: "acc_and_f1": (acc + f1) / 2, } - def pearson_and_spearman(preds, labels): pearson_corr = pearsonr(preds, labels)[0] spearman_corr = spearmanr(preds, labels)[0] @@ -56,7 +57,6 @@ if _has_sklearn: "corr": (pearson_corr + spearman_corr) / 2, } - def glue_compute_metrics(task_name, preds, labels): assert len(preds) == len(labels) if task_name == "cola": @@ -82,7 +82,6 @@ if _has_sklearn: else: raise KeyError(task_name) - def xnli_compute_metrics(task_name, preds, labels): assert len(preds) == len(labels) if task_name == "xnli": diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index acbb884fb..a867fe3fd 100644 --- a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -24,19 +24,21 @@ logger = logging.getLogger(__name__) def normalize_answer(s): """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): - regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) - return re.sub(regex, ' ', text) + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(regex, " ", text) def white_space_fix(text): - return ' '.join(text.split()) + return " ".join(text.split()) def remove_punc(text): exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) + return "".join(ch for ch in text if ch not in exclude) def lower(text): return text.lower() + return white_space_fix(remove_articles(remove_punc(lower(s)))) @@ -75,14 +77,14 @@ def get_raw_scores(examples, preds): for example in examples: qas_id = example.qas_id - gold_answers = [answer['text'] for answer in example.answers if normalize_answer(answer['text'])] + gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])] if not gold_answers: # For unanswerable questions, only correct answer is empty string - gold_answers = [''] + gold_answers = [""] if qas_id not in preds: - print('Missing prediction for %s' % qas_id) + print("Missing prediction for %s" % qas_id) continue prediction = preds[qas_id] @@ -106,23 +108,27 @@ def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): def make_eval_dict(exact_scores, f1_scores, qid_list=None): if not qid_list: total = len(exact_scores) - return collections.OrderedDict([ - ('exact', 100.0 * sum(exact_scores.values()) / total), - ('f1', 100.0 * sum(f1_scores.values()) / total), - ('total', total), - ]) + return collections.OrderedDict( + [ + ("exact", 100.0 * sum(exact_scores.values()) / total), + ("f1", 100.0 * sum(f1_scores.values()) / total), + ("total", total), + ] + ) else: total = len(qid_list) - return collections.OrderedDict([ - ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), - ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), - ('total', total), - ]) + return collections.OrderedDict( + [ + ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total), + ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total), + ("total", total), + ] + ) def merge_eval(main_eval, new_eval, prefix): for k in new_eval: - main_eval['%s_%s' % (prefix, k)] = new_eval[k] + main_eval["%s_%s" % (prefix, k)] = new_eval[k] 
def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): @@ -160,16 +166,14 @@ def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): - best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2( - preds, exact_raw, na_probs, qid_to_has_ans) - best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2( - preds, f1_raw, na_probs, qid_to_has_ans) - main_eval['best_exact'] = best_exact - main_eval['best_exact_thresh'] = exact_thresh - main_eval['best_f1'] = best_f1 - main_eval['best_f1_thresh'] = f1_thresh - main_eval['has_ans_exact'] = has_ans_exact - main_eval['has_ans_f1'] = has_ans_f1 + best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans) + main_eval["best_exact"] = best_exact + main_eval["best_exact_thresh"] = exact_thresh + main_eval["best_f1"] = best_f1 + main_eval["best_f1_thresh"] = f1_thresh + main_eval["has_ans_exact"] = has_ans_exact + main_eval["has_ans_f1"] = has_ans_f1 def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): @@ -199,10 +203,10 @@ def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_h best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) - main_eval['best_exact'] = best_exact - main_eval['best_exact_thresh'] = exact_thresh - main_eval['best_f1'] = best_f1 - main_eval['best_f1_thresh'] = f1_thresh + main_eval["best_exact"] = best_exact + main_eval["best_exact_thresh"] = exact_thresh + main_eval["best_f1"] = best_f1 + main_eval["best_f1_thresh"] = f1_thresh def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0): @@ -215,18 +219,20 @@ def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_ exact, f1 = get_raw_scores(examples, preds) - exact_threshold = apply_no_ans_threshold(exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold) + exact_threshold = apply_no_ans_threshold( + exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold + ) f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold) evaluation = make_eval_dict(exact_threshold, f1_threshold) if has_answer_qids: has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids) - merge_eval(evaluation, has_ans_eval, 'HasAns') + merge_eval(evaluation, has_ans_eval, "HasAns") if no_answer_qids: no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids) - merge_eval(evaluation, no_ans_eval, 'NoAns') + merge_eval(evaluation, no_ans_eval, "NoAns") if no_answer_probs: find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer) @@ -284,8 +290,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): start_position = tok_text.find(pred_text) if start_position == -1: if verbose_logging: - logger.info( - "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) return orig_text end_position = start_position + len(pred_text) - 1 @@ -294,8 +299,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): if len(orig_ns_text) != len(tok_ns_text): if 
verbose_logging: - logger.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text) return orig_text # We then project the characters in `pred_text` back to `orig_text` using @@ -326,7 +330,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): logger.info("Couldn't map end position") return orig_text - output_text = orig_text[orig_start_position:(orig_end_position + 1)] + output_text = orig_text[orig_start_position : (orig_end_position + 1)] return output_text @@ -393,8 +397,8 @@ def compute_predictions_logits( unique_id_to_result[result.unique_id] = result _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"] + ) all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() @@ -447,7 +451,9 @@ def compute_predictions_logits( start_index=start_index, end_index=end_index, start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) + end_logit=result.end_logits[end_index], + ) + ) if version_2_with_negative: prelim_predictions.append( _PrelimPrediction( @@ -455,14 +461,14 @@ def compute_predictions_logits( start_index=0, end_index=0, start_logit=null_start_logit, - end_logit=null_end_logit)) - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) + end_logit=null_end_logit, + ) + ) + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"]) + "NbestPrediction", ["text", "start_logit", "end_logit"] + ) seen_predictions = {} nbest = [] @@ -471,10 +477,10 @@ def compute_predictions_logits( break feature = features[pred.feature_index] if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = tokenizer.convert_tokens_to_string(tok_tokens) @@ -498,31 +504,21 @@ def compute_predictions_logits( final_text = "" seen_predictions[final_text] = True - nbest.append( - _NbestPrediction( - text=final_text, - start_logit=pred.start_logit, - end_logit=pred.end_logit)) + nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: - nbest.append( - _NbestPrediction( - text="", - start_logit=null_start_logit, - end_logit=null_end_logit)) + nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. 
if len(nbest) == 1: - nbest.insert(0, - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) assert len(nbest) >= 1 @@ -551,8 +547,7 @@ def compute_predictions_logits( all_predictions[example.qas_id] = nbest_json[0]["text"] else: # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - ( - best_non_null_entry.end_logit) + score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) scores_diff_json[example.qas_id] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example.qas_id] = "" @@ -586,7 +581,7 @@ def compute_predictions_log_probs( end_n_top, version_2_with_negative, tokenizer, - verbose_logging + verbose_logging, ): """ XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of null if needed. @@ -594,12 +589,12 @@ def compute_predictions_log_probs( Requires utils_squad_evaluate.py """ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", - "start_log_prob", "end_log_prob"]) + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"] + ) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] + ) logger.info("Writing predictions to: %s", output_prediction_file) # logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -663,12 +658,13 @@ def compute_predictions_log_probs( start_index=start_index, end_index=end_index, start_log_prob=start_log_prob, - end_log_prob=end_log_prob)) + end_log_prob=end_log_prob, + ) + ) prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_log_prob + x.end_log_prob), - reverse=True) + prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True + ) seen_predictions = {} nbest = [] @@ -688,10 +684,10 @@ def compute_predictions_log_probs( # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() # Previously used Bert untokenizer - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = tokenizer.convert_tokens_to_string(tok_tokens) # Clean whitespace @@ -704,8 +700,7 @@ def compute_predictions_log_probs( else: do_lower_case = tokenizer.do_lowercase_and_remove_accent - final_text = get_final_text(tok_text, orig_text, do_lower_case, - verbose_logging) + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: continue @@ -713,17 +708,13 @@ def compute_predictions_log_probs( seen_predictions[final_text] = True nbest.append( - _NbestPrediction( - 
text=final_text, - start_log_prob=pred.start_log_prob, - end_log_prob=pred.end_log_prob)) + _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob) + ) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: - nbest.append( - _NbestPrediction(text="", start_log_prob=-1e6, - end_log_prob=-1e6)) + nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6)) total_scores = [] best_non_null_entry = None diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py index 4f7307bb7..e59e9fbcb 100644 --- a/transformers/data/processors/__init__.py +++ b/transformers/data/processors/__init__.py @@ -1,4 +1,4 @@ from .utils import InputExample, InputFeatures, DataProcessor, SingleSentenceClassificationProcessor from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor -from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels \ No newline at end of file +from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels diff --git a/transformers/data/processors/glue.py b/transformers/data/processors/glue.py index 11ebd949d..f9c0132a7 100644 --- a/transformers/data/processors/glue.py +++ b/transformers/data/processors/glue.py @@ -27,15 +27,18 @@ if is_tf_available(): logger = logging.getLogger(__name__) -def glue_convert_examples_to_features(examples, tokenizer, - max_length=512, - task=None, - label_list=None, - output_mode=None, - pad_on_left=False, - pad_token=0, - pad_token_segment_id=0, - mask_padding_with_zero=True): +def glue_convert_examples_to_features( + examples, + tokenizer, + max_length=512, + task=None, + label_list=None, + output_mode=None, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + mask_padding_with_zero=True, +): """ Loads a data file into a list of ``InputFeatures`` @@ -82,12 +85,7 @@ def glue_convert_examples_to_features(examples, tokenizer, example = processor.get_example_from_tensor_dict(example) example = processor.tfds_map(example) - inputs = tokenizer.encode_plus( - example.text_a, - example.text_b, - add_special_tokens=True, - max_length=max_length, - ) + inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,) input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] # The mask has 1 for real tokens and 0 for padding tokens. 
Only real @@ -106,8 +104,12 @@ def glue_convert_examples_to_features(examples, tokenizer, token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) - assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length) - assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length) + assert len(attention_mask) == max_length, "Error with input length {} vs {}".format( + len(attention_mask), max_length + ) + assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format( + len(token_type_ids), max_length + ) if output_mode == "classification": label = label_map[example.label] @@ -125,28 +127,36 @@ def glue_convert_examples_to_features(examples, tokenizer, logger.info("label: %s (id = %d)" % (example.label, label)) features.append( - InputFeatures(input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - label=label)) + InputFeatures( + input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label + ) + ) if is_tf_available() and is_tf_dataset: + def gen(): for ex in features: - yield ({'input_ids': ex.input_ids, - 'attention_mask': ex.attention_mask, - 'token_type_ids': ex.token_type_ids}, - ex.label) - - return tf.data.Dataset.from_generator(gen, - ({'input_ids': tf.int32, - 'attention_mask': tf.int32, - 'token_type_ids': tf.int32}, - tf.int64), - ({'input_ids': tf.TensorShape([None]), - 'attention_mask': tf.TensorShape([None]), - 'token_type_ids': tf.TensorShape([None])}, - tf.TensorShape([]))) + yield ( + { + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "token_type_ids": ex.token_type_ids, + }, + ex.label, + ) + + return tf.data.Dataset.from_generator( + gen, + ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64), + ( + { + "input_ids": tf.TensorShape([None]), + "attention_mask": tf.TensorShape([None]), + "token_type_ids": tf.TensorShape([None]), + }, + tf.TensorShape([]), + ), + ) return features @@ -156,21 +166,21 @@ class MrpcProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['sentence1'].numpy().decode('utf-8'), - tensor_dict['sentence2'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence1"].numpy().decode("utf-8"), + tensor_dict["sentence2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv"))) - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -186,8 +196,7 @@ class MrpcProcessor(DataProcessor): text_a = line[3] text_b = line[4] label = line[0] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + 
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -196,21 +205,20 @@ class MnliProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['premise'].numpy().decode('utf-8'), - tensor_dict['hypothesis'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["premise"].numpy().decode("utf-8"), + tensor_dict["hypothesis"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), - "dev_matched") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") def get_labels(self): """See base class.""" @@ -226,8 +234,7 @@ class MnliProcessor(DataProcessor): text_a = line[8] text_b = line[9] label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -236,9 +243,7 @@ class MnliMismatchedProcessor(MnliProcessor): def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), - "dev_matched") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched") class ColaProcessor(DataProcessor): @@ -246,20 +251,20 @@ class ColaProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['sentence'].numpy().decode('utf-8'), - None, - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence"].numpy().decode("utf-8"), + None, + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -272,8 +277,7 @@ class ColaProcessor(DataProcessor): guid = "%s-%s" % (set_type, i) text_a = line[3] label = line[1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples @@ -282,20 +286,20 @@ class Sst2Processor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['sentence'].numpy().decode('utf-8'), - None, - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence"].numpy().decode("utf-8"), + None, + 
str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -310,8 +314,7 @@ class Sst2Processor(DataProcessor): guid = "%s-%s" % (set_type, i) text_a = line[0] label = line[1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples @@ -320,20 +323,20 @@ class StsbProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['sentence1'].numpy().decode('utf-8'), - tensor_dict['sentence2'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence1"].numpy().decode("utf-8"), + tensor_dict["sentence2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -349,8 +352,7 @@ class StsbProcessor(DataProcessor): text_a = line[7] text_b = line[8] label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -359,20 +361,20 @@ class QqpProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['question1'].numpy().decode('utf-8'), - tensor_dict['question2'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["question1"].numpy().decode("utf-8"), + tensor_dict["question2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -391,8 +393,7 @@ class QqpProcessor(DataProcessor): label = line[5] except IndexError: continue - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, 
label=label)) return examples @@ -401,21 +402,20 @@ class QnliProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['question'].numpy().decode('utf-8'), - tensor_dict['sentence'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["question"].numpy().decode("utf-8"), + tensor_dict["sentence"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), - "dev_matched") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") def get_labels(self): """See base class.""" @@ -431,8 +431,7 @@ class QnliProcessor(DataProcessor): text_a = line[1] text_b = line[2] label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -441,20 +440,20 @@ class RteProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['sentence1'].numpy().decode('utf-8'), - tensor_dict['sentence2'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence1"].numpy().decode("utf-8"), + tensor_dict["sentence2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -470,8 +469,7 @@ class RteProcessor(DataProcessor): text_a = line[1] text_b = line[2] label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -480,20 +478,20 @@ class WnliProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['sentence1'].numpy().decode('utf-8'), - tensor_dict['sentence2'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence1"].numpy().decode("utf-8"), + tensor_dict["sentence2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, 
data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -509,10 +507,10 @@ class WnliProcessor(DataProcessor): text_a = line[1] text_b = line[2] label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples + glue_tasks_num_labels = { "cola": 2, "mnli": 3, diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index fd5150e93..efb10830b 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -82,8 +82,8 @@ def _is_whitespace(c): return True return False -def squad_convert_example_to_features(example, max_seq_length, - doc_stride, max_query_length, is_training): + +def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training): features = [] if is_training and not example.is_impossible: # Get start and end position @@ -91,7 +91,7 @@ def squad_convert_example_to_features(example, max_seq_length, end_position = example.end_position # If the answer cannot be found in the text, then skip this example. - actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)]) + actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)]) cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) @@ -121,8 +121,11 @@ def squad_convert_example_to_features(example, max_seq_length, spans = [] truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) - sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + 1 \ - if 'roberta' in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence + sequence_added_tokens = ( + tokenizer.max_len - tokenizer.max_len_single_sentence + 1 + if "roberta" in str(type(tokenizer)) + else tokenizer.max_len - tokenizer.max_len_single_sentence + ) sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair span_doc_tokens = all_doc_tokens @@ -135,16 +138,18 @@ def squad_convert_example_to_features(example, max_seq_length, return_overflowing_tokens=True, pad_to_max_length=True, stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first' + truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first", ) - paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, - max_seq_length - len(truncated_query) - sequence_pair_added_tokens) + paragraph_len = min( + len(all_doc_tokens) - len(spans) * doc_stride, + max_seq_length - len(truncated_query) - sequence_pair_added_tokens, + ) - if tokenizer.pad_token_id in encoded_dict['input_ids']: - non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)] + if tokenizer.pad_token_id in encoded_dict["input_ids"]: + non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)] else: - non_padded_ids = encoded_dict['input_ids'] + non_padded_ids = 
encoded_dict["input_ids"] tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) @@ -170,17 +175,20 @@ def squad_convert_example_to_features(example, max_seq_length, for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) - index = j if tokenizer.padding_side == "left" else spans[doc_span_index][ - "truncated_query_with_special_tokens_length"] + j + index = ( + j + if tokenizer.padding_side == "left" + else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j + ) spans[doc_span_index]["token_is_max_context"][index] = is_max_context for span in spans: # Identify the position of the CLS token - cls_index = span['input_ids'].index(tokenizer.cls_token_id) + cls_index = span["input_ids"].index(tokenizer.cls_token_id) # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) # Original TF implem also keep the classification token (set to 0) (not sure why...) - p_mask = np.array(span['token_type_ids']) + p_mask = np.array(span["token_type_ids"]) p_mask = np.minimum(p_mask, 1) @@ -219,31 +227,34 @@ def squad_convert_example_to_features(example, max_seq_length, start_position = tok_start_position - doc_start + doc_offset end_position = tok_end_position - doc_start + doc_offset - features.append(SquadFeatures( - span['input_ids'], - span['attention_mask'], - span['token_type_ids'], - cls_index, - p_mask.tolist(), - example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. - unique_id=0, - paragraph_len=span['paragraph_len'], - token_is_max_context=span["token_is_max_context"], - tokens=span["tokens"], - token_to_orig_map=span["token_to_orig_map"], - - start_position=start_position, - end_position=end_position - )) + features.append( + SquadFeatures( + span["input_ids"], + span["attention_mask"], + span["token_type_ids"], + cls_index, + p_mask.tolist(), + example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. + unique_id=0, + paragraph_len=span["paragraph_len"], + token_is_max_context=span["token_is_max_context"], + tokens=span["tokens"], + token_to_orig_map=span["token_to_orig_map"], + start_position=start_position, + end_position=end_position, + ) + ) return features + def squad_convert_example_to_features_init(tokenizer_for_convert): global tokenizer tokenizer = tokenizer_for_convert -def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training, - return_dataset=False, threads=1): + +def squad_convert_examples_to_features( + examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False, threads=1 +): """ Converts a list of examples into a list of features that can be directly given as input to a model. It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. 
@@ -279,17 +290,28 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, ) """ - # Defining helper methods + # Defining helper methods features = [] threads = min(threads, cpu_count()) with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: - annotate_ = partial(squad_convert_example_to_features, max_seq_length=max_seq_length, - doc_stride=doc_stride, max_query_length=max_query_length, is_training=is_training) - features = list(tqdm(p.imap(annotate_, examples, chunksize=32), total=len(examples), desc='convert squad examples to features')) + annotate_ = partial( + squad_convert_example_to_features, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + is_training=is_training, + ) + features = list( + tqdm( + p.imap(annotate_, examples, chunksize=32), + total=len(examples), + desc="convert squad examples to features", + ) + ) new_features = [] unique_id = 1000000000 example_index = 0 - for example_features in tqdm(features, total=len(features), desc='add example index and unique id'): + for example_features in tqdm(features, total=len(features), desc="add example index and unique id"): if not example_features: continue for example_feature in example_features: @@ -300,7 +322,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, example_index += 1 features = new_features del new_features - if return_dataset == 'pt': + if return_dataset == "pt": if not is_torch_available(): raise ImportError("Pytorch must be installed to return a pytorch dataset.") @@ -341,12 +363,13 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, "input_ids": ex.input_ids, "attention_mask": ex.attention_mask, "token_type_ids": ex.token_type_ids, - }, { + }, + { "start_position": ex.start_position, "end_position": ex.end_position, "cls_index": ex.cls_index, "p_mask": ex.p_mask, - } + }, ) return tf.data.Dataset.from_generator( diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py index ee234e6e9..41cc00d4b 100644 --- a/transformers/data/processors/utils.py +++ b/transformers/data/processors/utils.py @@ -24,6 +24,7 @@ from ...file_utils import is_tf_available, is_torch_available logger = logging.getLogger(__name__) + class InputExample(object): """ A single training/test example for simple sequence classification. @@ -37,6 +38,7 @@ class InputExample(object): label: (Optional) string. The label of the example. This should be specified for train and dev examples, but not for test examples. 
""" + def __init__(self, guid, text_a, text_b=None, label=None): self.guid = guid self.text_a = text_a @@ -99,14 +101,15 @@ class DataProcessor(object): lines = [] for line in reader: if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) + line = list(unicode(cell, "utf-8") for cell in line) lines.append(line) return lines class SingleSentenceClassificationProcessor(DataProcessor): """ Generic processor for a single sentence classification data set.""" - def __init__(self, labels=None, examples=None, mode='classification', verbose=False): + + def __init__(self, labels=None, examples=None, mode="classification", verbose=False): self.labels = [] if labels is None else labels self.examples = [] if examples is None else examples self.mode = mode @@ -117,22 +120,24 @@ class SingleSentenceClassificationProcessor(DataProcessor): def __getitem__(self, idx): if isinstance(idx, slice): - return SingleSentenceClassificationProcessor(labels=self.labels, - examples=self.examples[idx]) + return SingleSentenceClassificationProcessor(labels=self.labels, examples=self.examples[idx]) return self.examples[idx] @classmethod - def create_from_csv(cls, file_name, split_name='', column_label=0, column_text=1, - column_id=None, skip_first_row=False, **kwargs): + def create_from_csv( + cls, file_name, split_name="", column_label=0, column_text=1, column_id=None, skip_first_row=False, **kwargs + ): processor = cls(**kwargs) - processor.add_examples_from_csv(file_name, - split_name=split_name, - column_label=column_label, - column_text=column_text, - column_id=column_id, - skip_first_row=skip_first_row, - overwrite_labels=True, - overwrite_examples=True) + processor.add_examples_from_csv( + file_name, + split_name=split_name, + column_label=column_label, + column_text=column_text, + column_id=column_id, + skip_first_row=skip_first_row, + overwrite_labels=True, + overwrite_examples=True, + ) return processor @classmethod @@ -141,8 +146,17 @@ class SingleSentenceClassificationProcessor(DataProcessor): processor.add_examples(texts_or_text_and_labels, labels=labels) return processor - def add_examples_from_csv(self, file_name, split_name='', column_label=0, column_text=1, column_id=None, - skip_first_row=False, overwrite_labels=False, overwrite_examples=False): + def add_examples_from_csv( + self, + file_name, + split_name="", + column_label=0, + column_text=1, + column_id=None, + skip_first_row=False, + overwrite_labels=False, + overwrite_examples=False, + ): lines = self._read_tsv(file_name) if skip_first_row: lines = lines[1:] @@ -158,10 +172,13 @@ class SingleSentenceClassificationProcessor(DataProcessor): guid = "%s-%s" % (split_name, i) if split_name else "%s" % i ids.append(guid) - return self.add_examples(texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples) + return self.add_examples( + texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples + ) - def add_examples(self, texts_or_text_and_labels, labels=None, ids=None, - overwrite_labels=False, overwrite_examples=False): + def add_examples( + self, texts_or_text_and_labels, labels=None, ids=None, overwrite_labels=False, overwrite_examples=False + ): assert labels is None or len(texts_or_text_and_labels) == len(labels) assert ids is None or len(texts_or_text_and_labels) == len(ids) if ids is None: @@ -192,13 +209,15 @@ class SingleSentenceClassificationProcessor(DataProcessor): return self.examples - def get_features(self, - tokenizer, - max_length=None, - 
pad_on_left=False, - pad_token=0, - mask_padding_with_zero=True, - return_tensors=None): + def get_features( + self, + tokenizer, + max_length=None, + pad_on_left=False, + pad_token=0, + mask_padding_with_zero=True, + return_tensors=None, + ): """ Convert examples in a list of ``InputFeatures`` @@ -231,9 +250,7 @@ class SingleSentenceClassificationProcessor(DataProcessor): logger.info("Tokenizing example %d", ex_index) input_ids = tokenizer.encode( - example.text_a, - add_special_tokens=True, - max_length=min(max_length, tokenizer.max_len), + example.text_a, add_special_tokens=True, max_length=min(max_length, tokenizer.max_len), ) all_input_ids.append(input_ids) @@ -256,8 +273,12 @@ class SingleSentenceClassificationProcessor(DataProcessor): input_ids = input_ids + ([pad_token] * padding_length) attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - assert len(input_ids) == batch_length, "Error with input length {} vs {}".format(len(input_ids), batch_length) - assert len(attention_mask) == batch_length, "Error with input length {} vs {}".format(len(attention_mask), batch_length) + assert len(input_ids) == batch_length, "Error with input length {} vs {}".format( + len(input_ids), batch_length + ) + assert len(attention_mask) == batch_length, "Error with input length {} vs {}".format( + len(attention_mask), batch_length + ) if self.mode == "classification": label = label_map[example.label] @@ -273,36 +294,31 @@ class SingleSentenceClassificationProcessor(DataProcessor): logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) logger.info("label: %s (id = %d)" % (example.label, label)) - features.append( - InputFeatures(input_ids=input_ids, - attention_mask=attention_mask, - label=label)) + features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label)) if return_tensors is None: return features - elif return_tensors == 'tf': + elif return_tensors == "tf": if not is_tf_available(): raise ImportError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported") import tensorflow as tf + def gen(): for ex in features: - yield ({'input_ids': ex.input_ids, - 'attention_mask': ex.attention_mask}, - ex.label) - - dataset = tf.data.Dataset.from_generator(gen, - ({'input_ids': tf.int32, - 'attention_mask': tf.int32}, - tf.int64), - ({'input_ids': tf.TensorShape([None]), - 'attention_mask': tf.TensorShape([None])}, - tf.TensorShape([]))) + yield ({"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}, ex.label) + + dataset = tf.data.Dataset.from_generator( + gen, + ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64), + ({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}, tf.TensorShape([])), + ) return dataset - elif return_tensors == 'pt': + elif return_tensors == "pt": if not is_torch_available(): raise ImportError("return_tensors set to 'pt' but PyTorch can't be imported") import torch from torch.utils.data import TensorDataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) if self.mode == "classification": diff --git a/transformers/data/processors/xnli.py b/transformers/data/processors/xnli.py index 958bdf62f..ffe0358c1 100644 --- a/transformers/data/processors/xnli.py +++ b/transformers/data/processors/xnli.py @@ -24,11 +24,12 @@ from .utils import DataProcessor, InputExample logger = logging.getLogger(__name__) + class 
XnliProcessor(DataProcessor): """Processor for the XNLI dataset. Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207""" - def __init__(self, language, train_language = None): + def __init__(self, language, train_language=None): self.language = language self.train_language = train_language @@ -40,13 +41,12 @@ class XnliProcessor(DataProcessor): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % ('train', i) + guid = "%s-%s" % ("train", i) text_a = line[0] text_b = line[1] label = "contradiction" if line[2] == "contradictory" else line[2] assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples def get_test_examples(self, data_dir): @@ -59,19 +59,19 @@ class XnliProcessor(DataProcessor): language = line[0] if language != self.language: continue - guid = "%s-%s" % ('test', i) + guid = "%s-%s" % ("test", i) text_a = line[6] text_b = line[7] label = line[1] assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples def get_labels(self): """See base class.""" return ["contradiction", "entailment", "neutral"] + xnli_processors = { "xnli": XnliProcessor, } diff --git a/transformers/file_utils.py b/transformers/file_utils.py index ec925c616..c45bdee04 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -3,7 +3,7 @@ Utilities for working with the local dataset cache. This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp Copyright by the AllenNLP authors. 
""" -from __future__ import (absolute_import, division, print_function, unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import sys import json @@ -29,9 +29,10 @@ from filelock import FileLock logger = logging.getLogger(__name__) # pylint: disable=invalid-name try: - os.environ.setdefault('USE_TORCH', 'YES') - if os.environ['USE_TORCH'].upper() in ('1', 'ON', 'YES'): + os.environ.setdefault("USE_TORCH", "YES") + if os.environ["USE_TORCH"].upper() in ("1", "ON", "YES"): import torch + _torch_available = True # pylint: disable=invalid-name logger.info("PyTorch version {} available.".format(torch.__version__)) else: @@ -41,10 +42,11 @@ except ImportError: _torch_available = False # pylint: disable=invalid-name try: - os.environ.setdefault('USE_TF', 'YES') - if os.environ['USE_TF'].upper() in ('1', 'ON', 'YES'): + os.environ.setdefault("USE_TF", "YES") + if os.environ["USE_TF"].upper() in ("1", "ON", "YES"): import tensorflow as tf - assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2 + + assert hasattr(tf, "__version__") and int(tf.__version__[0]) >= 2 _tf_available = True # pylint: disable=invalid-name logger.info("TensorFlow version {} available.".format(tf.__version__)) else: @@ -55,12 +57,13 @@ except (ImportError, AssertionError): try: from torch.hub import _get_torch_home + torch_cache_home = _get_torch_home() except ImportError: torch_cache_home = os.path.expanduser( - os.getenv('TORCH_HOME', os.path.join( - os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) -default_cache_path = os.path.join(torch_cache_home, 'transformers') + os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch")) + ) +default_cache_path = os.path.join(torch_cache_home, "transformers") try: from urllib.parse import urlparse @@ -69,19 +72,21 @@ except ImportError: try: from pathlib import Path + PYTORCH_PRETRAINED_BERT_CACHE = Path( - os.getenv('PYTORCH_TRANSFORMERS_CACHE', os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))) + os.getenv("PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)) + ) except (AttributeError, ImportError): - PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_TRANSFORMERS_CACHE', - os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', - default_cache_path)) + PYTORCH_PRETRAINED_BERT_CACHE = os.getenv( + "PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path) + ) PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility WEIGHTS_NAME = "pytorch_model.bin" -TF2_WEIGHTS_NAME = 'tf_model.h5' -TF_WEIGHTS_NAME = 'model.ckpt' +TF2_WEIGHTS_NAME = "tf_model.h5" +TF_WEIGHTS_NAME = "model.ckpt" CONFIG_NAME = "config.json" MODEL_CARD_NAME = "modelcard.json" @@ -95,38 +100,48 @@ CLOUDFRONT_DISTRIB_PREFIX = "https://d2ws9o8vfrpkyk.cloudfront.net" def is_torch_available(): return _torch_available + def is_tf_available(): return _tf_available + if not six.PY2: + def add_start_docstrings(*docstr): def docstring_decorator(fn): - fn.__doc__ = ''.join(docstr) + fn.__doc__ + fn.__doc__ = "".join(docstr) + fn.__doc__ return fn + return docstring_decorator def add_end_docstrings(*docstr): def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + ''.join(docstr) + fn.__doc__ = fn.__doc__ + "".join(docstr) return fn + return docstring_decorator + + else: # Not possible to update class docstrings on python2 def 
add_start_docstrings(*docstr): def docstring_decorator(fn): return fn + return docstring_decorator def add_end_docstrings(*docstr): def docstring_decorator(fn): return fn + return docstring_decorator def is_remote_url(url_or_filename): parsed = urlparse(url_or_filename) - return parsed.scheme in ('http', 'https', 's3') + return parsed.scheme in ("http", "https", "s3") + def hf_bucket_url(identifier, postfix=None, cdn=False): endpoint = CLOUDFRONT_DISTRIB_PREFIX if cdn else S3_BUCKET_PREFIX @@ -145,17 +160,17 @@ def url_to_filename(url, etag=None): so that TF 2.0 can identify it as a HDF5 file (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) """ - url_bytes = url.encode('utf-8') + url_bytes = url.encode("utf-8") url_hash = sha256(url_bytes) filename = url_hash.hexdigest() if etag: - etag_bytes = etag.encode('utf-8') + etag_bytes = etag.encode("utf-8") etag_hash = sha256(etag_bytes) - filename += '.' + etag_hash.hexdigest() + filename += "." + etag_hash.hexdigest() - if url.endswith('.h5'): - filename += '.h5' + if url.endswith(".h5"): + filename += ".h5" return filename @@ -174,19 +189,21 @@ def filename_to_url(filename, cache_dir=None): if not os.path.exists(cache_path): raise EnvironmentError("file {} not found".format(cache_path)) - meta_path = cache_path + '.json' + meta_path = cache_path + ".json" if not os.path.exists(meta_path): raise EnvironmentError("file {} not found".format(meta_path)) with open(meta_path, encoding="utf-8") as meta_file: metadata = json.load(meta_file) - url = metadata['url'] - etag = metadata['etag'] + url = metadata["url"] + etag = metadata["etag"] return url, etag -def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False, user_agent=None): +def cached_path( + url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False, user_agent=None +): """ Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file and cache it, and @@ -207,13 +224,18 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N if is_remote_url(url_or_filename): # URL, so get it from the cache (downloading if necessary) - return get_from_cache(url_or_filename, cache_dir=cache_dir, - force_download=force_download, proxies=proxies, - resume_download=resume_download, user_agent=user_agent) + return get_from_cache( + url_or_filename, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + user_agent=user_agent, + ) elif os.path.exists(url_or_filename): # File, and it exists. return url_or_filename - elif urlparse(url_or_filename).scheme == '': + elif urlparse(url_or_filename).scheme == "": # File, but it doesn't exist. 
raise EnvironmentError("file {} not found".format(url_or_filename)) else: @@ -273,31 +295,35 @@ def s3_get(url, temp_file, proxies=None): def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None): ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0]) if isinstance(user_agent, dict): - ua += "; " + "; ".join( - "{}/{}".format(k, v) for k, v in user_agent.items() - ) + ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items()) elif isinstance(user_agent, six.string_types): - ua += "; "+ user_agent - headers = { - "user-agent": ua - } + ua += "; " + user_agent + headers = {"user-agent": ua} if resume_size > 0: - headers['Range'] = 'bytes=%d-' % (resume_size,) + headers["Range"] = "bytes=%d-" % (resume_size,) response = requests.get(url, stream=True, proxies=proxies, headers=headers) if response.status_code == 416: # Range not satisfiable return - content_length = response.headers.get('Content-Length') + content_length = response.headers.get("Content-Length") total = resume_size + int(content_length) if content_length is not None else None - progress = tqdm(unit="B", unit_scale=True, total=total, initial=resume_size, - desc="Downloading", disable=bool(logger.level<=logging.INFO)) + progress = tqdm( + unit="B", + unit_scale=True, + total=total, + initial=resume_size, + desc="Downloading", + disable=bool(logger.level <= logging.INFO), + ) for chunk in response.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks + if chunk: # filter out keep-alive new chunks progress.update(len(chunk)) temp_file.write(chunk) progress.close() -def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False, user_agent=None): +def get_from_cache( + url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False, user_agent=None +): """ Given a URL, look for the corresponding dataset in the local cache. If it's not there, download it. Then return the path to the cached file. @@ -326,7 +352,7 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag etag = None if sys.version_info[0] == 2 and etag is not None: - etag = etag.decode('utf-8') + etag = etag.decode("utf-8") filename = url_to_filename(url, etag) # get cache path to put the file @@ -337,22 +363,24 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag if not os.path.exists(cache_path) and etag is None: matching_files = [ file - for file in fnmatch.filter(os.listdir(cache_dir), filename + '.*') - if not file.endswith('.json') and not file.endswith('.lock') + for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*") + if not file.endswith(".json") and not file.endswith(".lock") ] if matching_files: cache_path = os.path.join(cache_dir, matching_files[-1]) # Prevent parallel downloads of the same file with a lock. 
- lock_path = cache_path + '.lock' + lock_path = cache_path + ".lock" with FileLock(lock_path): if resume_download: - incomplete_path = cache_path + '.incomplete' + incomplete_path = cache_path + ".incomplete" + @contextmanager def _resumable_file_manager(): - with open(incomplete_path,'a+b') as f: + with open(incomplete_path, "a+b") as f: yield f + temp_file_manager = _resumable_file_manager if os.path.exists(incomplete_path): resume_size = os.stat(incomplete_path).st_size @@ -366,7 +394,9 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag # Download to temporary file, then copy to cache dir once finished. # Otherwise you get corrupt cache entries if the download gets interrupted. with temp_file_manager() as temp_file: - logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) + logger.info( + "%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name + ) # GET file object if url.startswith("s3://"): @@ -383,12 +413,12 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag os.rename(temp_file.name, cache_path) logger.info("creating metadata file for %s", cache_path) - meta = {'url': url, 'etag': etag} - meta_path = cache_path + '.json' - with open(meta_path, 'w') as meta_file: + meta = {"url": url, "etag": etag} + meta_path = cache_path + ".json" + with open(meta_path, "w") as meta_file: output_string = json.dumps(meta) if sys.version_info[0] == 2 and isinstance(output_string, str): - output_string = unicode(output_string, 'utf-8') # The beauty of python 2 + output_string = unicode(output_string, "utf-8") # The beauty of python 2 meta_file.write(output_string) return cache_path diff --git a/transformers/hf_api.py b/transformers/hf_api.py index 170732339..81cc9f7eb 100644 --- a/transformers/hf_api.py +++ b/transformers/hf_api.py @@ -24,13 +24,14 @@ from tqdm import tqdm ENDPOINT = "https://huggingface.co" + class S3Obj: def __init__( self, - filename, # type: str - LastModified, # type: str - ETag, # type: str - Size, # type: int + filename, # type: str + LastModified, # type: str + ETag, # type: str + Size, # type: int **kwargs ): self.filename = filename @@ -43,13 +44,13 @@ class PresignedUrl: def __init__( self, write, # type: str - access, # type: str - type, # type: str + access, # type: str + type, # type: str **kwargs ): self.write = write self.access = access - self.type = type # mime-type to send to S3. + self.type = type # mime-type to send to S3. class HfApi: @@ -58,8 +59,8 @@ class HfApi: def login( self, - username, # type: str - password, # type: str + username, # type: str + password, # type: str ): # type: (...) -> str """ @@ -78,8 +79,7 @@ class HfApi: return d["token"] def whoami( - self, - token, # type: str + self, token, # type: str ): # type: (...) -> str """ @@ -106,11 +106,7 @@ class HfApi: Call HF API to get a presigned url to upload `filename` to S3. 
""" path = "{}/api/presign".format(self.endpoint) - r = requests.post( - path, - headers={"authorization": "Bearer {}".format(token)}, - json={"filename": filename}, - ) + r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename},) r.raise_for_status() d = r.json() return PresignedUrl(**d) @@ -126,16 +122,14 @@ class HfApi: urls = self.presign(token, filename=filename) # streaming upload: # https://2.python-requests.org/en/master/user/advanced/#streaming-uploads - # + # # Even though we presign with the correct content-type, # the client still has to specify it when uploading the file. with open(filepath, "rb") as f: pf = TqdmProgressFileReader(f) data = f if pf.total_size > 0 else "" - r = requests.put(urls.write, data=data, headers={ - "content-type": urls.type, - }) + r = requests.put(urls.write, data=data, headers={"content-type": urls.type,}) r.raise_for_status() pf.close() return urls.access @@ -152,7 +146,6 @@ class HfApi: return [S3Obj(**x) for x in d] - class TqdmProgressFileReader: """ Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`) @@ -161,12 +154,12 @@ class TqdmProgressFileReader: see github.com/huggingface/transformers/pull/2078#discussion_r354739608 for implementation details. """ + def __init__( - self, - f # type: io.BufferedReader + self, f # type: io.BufferedReader ): self.f = f - self.total_size = os.fstat(f.fileno()).st_size # type: int + self.total_size = os.fstat(f.fileno()).st_size # type: int self.pbar = tqdm(total=self.total_size, leave=False) if six.PY3: # does not work unless PY3 @@ -182,7 +175,6 @@ class TqdmProgressFileReader: self.pbar.close() - class HfFolder: path_token = expanduser("~/.huggingface/token") @@ -201,7 +193,7 @@ class HfFolder: if e.errno != os.errno.EEXIST: raise e pass - with open(cls.path_token, 'w+') as f: + with open(cls.path_token, "w+") as f: f.write(token) @classmethod @@ -210,7 +202,7 @@ class HfFolder: Get token or None if not existent. """ try: - with open(cls.path_token, 'r') as f: + with open(cls.path_token, "r") as f: return f.read() except: # this is too wide. When Py2 is dead use: diff --git a/transformers/modelcard.py b/transformers/modelcard.py index 4a879235a..e6b1982e9 100644 --- a/transformers/modelcard.py +++ b/transformers/modelcard.py @@ -14,8 +14,7 @@ # limitations under the License. 
""" Configuration base class and utilities.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import copy import json @@ -25,8 +24,15 @@ from io import open from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP -from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, WEIGHTS_NAME, TF2_WEIGHTS_NAME, \ - cached_path, is_remote_url, hf_bucket_url +from .file_utils import ( + CONFIG_NAME, + MODEL_CARD_NAME, + WEIGHTS_NAME, + TF2_WEIGHTS_NAME, + cached_path, + is_remote_url, + hf_bucket_url, +) logger = logging.getLogger(__name__) @@ -48,17 +54,18 @@ class ModelCard(object): Parameters: """ + def __init__(self, **kwargs): # Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers) - self.model_details = kwargs.pop('model_details', {}) - self.intended_use = kwargs.pop('intended_use', {}) - self.factors = kwargs.pop('factors', {}) - self.metrics = kwargs.pop('metrics', {}) - self.evaluation_data = kwargs.pop('evaluation_data', {}) - self.training_data = kwargs.pop('training_data', {}) - self.quantitative_analyses = kwargs.pop('quantitative_analyses', {}) - self.ethical_considerations = kwargs.pop('ethical_considerations', {}) - self.caveats_and_recommendations = kwargs.pop('caveats_and_recommendations', {}) + self.model_details = kwargs.pop("model_details", {}) + self.intended_use = kwargs.pop("intended_use", {}) + self.factors = kwargs.pop("factors", {}) + self.metrics = kwargs.pop("metrics", {}) + self.evaluation_data = kwargs.pop("evaluation_data", {}) + self.training_data = kwargs.pop("training_data", {}) + self.quantitative_analyses = kwargs.pop("quantitative_analyses", {}) + self.ethical_considerations = kwargs.pop("ethical_considerations", {}) + self.caveats_and_recommendations = kwargs.pop("caveats_and_recommendations", {}) # Open additional attributes for key, value in kwargs.items(): @@ -122,10 +129,10 @@ class ModelCard(object): modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False) """ - cache_dir = kwargs.pop('cache_dir', None) - proxies = kwargs.pop('proxies', None) - find_from_standard_name = kwargs.pop('find_from_standard_name', True) - return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) + cache_dir = kwargs.pop("cache_dir", None) + proxies = kwargs.pop("proxies", None) + find_from_standard_name = kwargs.pop("find_from_standard_name", True) + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: # For simplicity we use the same pretrained url than the configuration files @@ -145,36 +152,43 @@ class ModelCard(object): try: # Load from URL or cache if already cached - resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=True, - proxies=proxies, resume_download=False) + resolved_model_card_file = cached_path( + model_card_file, cache_dir=cache_dir, force_download=True, proxies=proxies, resume_download=False + ) if resolved_model_card_file == model_card_file: logger.info("loading model card file {}".format(model_card_file)) else: - logger.info("loading model card file {} from cache at {}".format( - model_card_file, resolved_model_card_file)) + logger.info( + "loading model card file {} from cache at {}".format(model_card_file, resolved_model_card_file) + ) # Load model card modelcard = cls.from_json_file(resolved_model_card_file) except EnvironmentError: if 
pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: - logger.warning("Couldn't reach server at '{}' to download model card file.".format( - model_card_file)) + logger.warning("Couldn't reach server at '{}' to download model card file.".format(model_card_file)) else: - logger.warning("Model name '{}' was not found in model name list ({}). " \ - "We assumed '{}' was a path or url to a model card file named {} or " \ - "a directory containing such a file but couldn't find any such file at this path or url.".format( + logger.warning( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url to a model card file named {} or " + "a directory containing such a file but couldn't find any such file at this path or url.".format( pretrained_model_name_or_path, - ', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), - model_card_file, MODEL_CARD_NAME)) + ", ".join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), + model_card_file, + MODEL_CARD_NAME, + ) + ) logger.warning("Creating an empty model card.") # We fall back on creating an empty model card modelcard = cls() except json.JSONDecodeError: - logger.warning("Couldn't reach server at '{}' to download model card file or " - "model card file is not a valid JSON file. " - "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file)) + logger.warning( + "Couldn't reach server at '{}' to download model card file or " + "model card file is not a valid JSON file. " + "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file) + ) logger.warning("Creating an empty model card.") # We fall back on creating an empty model card @@ -203,7 +217,7 @@ class ModelCard(object): @classmethod def from_json_file(cls, json_file): """Constructs a `ModelCard` from a json file of parameters.""" - with open(json_file, "r", encoding='utf-8') as reader: + with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() dict_obj = json.loads(text) return cls(**dict_obj) @@ -225,5 +239,5 @@ class ModelCard(object): def to_json_file(self, json_file_path): """ Save this instance to a json file.""" - with open(json_file_path, "w", encoding='utf-8') as writer: + with open(json_file_path, "w", encoding="utf-8") as writer: writer.write(self.to_json_string()) diff --git a/transformers/modeling_albert.py b/transformers/modeling_albert.py index f833b6d6b..3d55bcd64 100644 --- a/transformers/modeling_albert.py +++ b/transformers/modeling_albert.py @@ -1,4 +1,3 @@ - # coding=utf-8 # Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. 
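# A small usage sketch for the ModelCard helpers reformatted above, assuming a plain
# local JSON round-trip; the attribute values are placeholders.
from transformers.modelcard import ModelCard

card = ModelCard(model_details={"name": "my-model"}, intended_use={"primary_uses": "demo"})
card.to_json_file("modelcard.json")  # written as UTF-8 JSON, as in to_json_file above
restored = ModelCard.from_json_file("modelcard.json")
assert restored.model_details["name"] == "my-model"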
# @@ -30,14 +29,14 @@ logger = logging.getLogger(__name__) ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin", - 'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin", - 'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin", - 'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin", - 'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin", - 'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin", - 'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin", - 'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin", + "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin", + "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin", + "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin", + "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin", + "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin", + "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin", + "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin", + "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin", } @@ -48,8 +47,10 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path): import numpy as np import tensorflow as tf except ImportError: - logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) @@ -65,7 +66,7 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path): for name, array in zip(names, arrays): print(name) - + for name, array in zip(names, arrays): original_name = name @@ -75,10 +76,10 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path): # Renaming and simplifying name = name.replace("ffn_1", "ffn") name = name.replace("bert/", "albert/") - name = name.replace("attention_1", "attention") + name = name.replace("attention_1", "attention") name = name.replace("transform/", "") - name = name.replace("LayerNorm_1", "full_layer_layer_norm") - name = name.replace("LayerNorm", "attention/LayerNorm") + name = name.replace("LayerNorm_1", "full_layer_layer_norm") + name = name.replace("LayerNorm", "attention/LayerNorm") name = name.replace("transformer/", "") # The feed forward layer had an 'intermediate' step which has been abstracted away @@ -97,19 +98,19 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path): name = name.replace("predictions/attention", "predictions") # Naming was changed to be more explicit - name = name.replace("embeddings/attention", "embeddings") - name = name.replace("inner_group_", "albert_layers/") - name = name.replace("group_", "albert_layer_groups/") + name = name.replace("embeddings/attention", "embeddings") + name = name.replace("inner_group_", "albert_layers/") + name = name.replace("group_", "albert_layer_groups/") # Classifier if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name): name = "classifier/" + name - # No ALBERT model currently handles the next sentence prediction task + # No ALBERT model currently handles the next sentence prediction task if "seq_relationship" in name: continue - name = name.split('/') + name = name.split("/") # Ignore the gradients applied by the LAMB/ADAM optimizers. if "adam_m" in name or "adam_v" in name or "global_step" in name: @@ -118,19 +119,19 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path): pointer = model for m_name in name: - if re.fullmatch(r'[A-Za-z]+_\d+', m_name): - l = re.split(r'_(\d+)', m_name) + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + l = re.split(r"_(\d+)", m_name) else: l = [m_name] - if l[0] == 'kernel' or l[0] == 'gamma': - pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias' or l[0] == 'beta': - pointer = getattr(pointer, 'bias') - elif l[0] == 'output_weights': - pointer = getattr(pointer, 'weight') - elif l[0] == 'squad': - pointer = getattr(pointer, 'classifier') + if l[0] == "kernel" or l[0] == "gamma": + pointer = getattr(pointer, "weight") + elif l[0] == "output_bias" or l[0] == "beta": + pointer = getattr(pointer, "bias") + elif l[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif l[0] == "squad": + pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, l[0]) @@ -141,9 +142,9 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path): num = int(l[1]) pointer = pointer[num] - if m_name[-11:] == '_embeddings': - pointer = getattr(pointer, 'weight') - elif m_name == 'kernel': + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": array = np.transpose(array) try: assert pointer.shape == array.shape @@ -160,6 +161,7 @@ class AlbertEmbeddings(BertEmbeddings): """ Construct the embeddings from word, position and token_type embeddings. 
""" + def __init__(self, config): super(AlbertEmbeddings, self).__init__(config) @@ -175,7 +177,7 @@ class AlbertAttention(BertSelfAttention): self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads - self.hidden_size = config.hidden_size + self.hidden_size = config.hidden_size self.attention_head_size = config.hidden_size // config.num_attention_heads self.dropout = nn.Dropout(config.attention_probs_dropout_prob) self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -237,10 +239,13 @@ class AlbertAttention(BertSelfAttention): context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) reshaped_context_layer = context_layer.view(*new_context_layer_shape) - # Should find a better way to do this - w = self.dense.weight.t().view(self.num_attention_heads, self.attention_head_size, self.hidden_size).to(context_layer.dtype) + w = ( + self.dense.weight.t() + .view(self.num_attention_heads, self.attention_head_size, self.hidden_size) + .to(context_layer.dtype) + ) b = self.dense.bias.to(context_layer.dtype) projected_context_layer = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b @@ -252,11 +257,11 @@ class AlbertAttention(BertSelfAttention): class AlbertLayer(nn.Module): def __init__(self, config): super(AlbertLayer, self).__init__() - + self.config = config self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.attention = AlbertAttention(config) - self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) + self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size) self.activation = ACT2FN[config.hidden_act] @@ -273,7 +278,7 @@ class AlbertLayer(nn.Module): class AlbertLayerGroup(nn.Module): def __init__(self, config): super(AlbertLayerGroup, self).__init__() - + self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)]) @@ -303,7 +308,7 @@ class AlbertLayerGroup(nn.Module): class AlbertTransformer(nn.Module): def __init__(self, config): super(AlbertTransformer, self).__init__() - + self.config = config self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states @@ -327,8 +332,12 @@ class AlbertTransformer(nn.Module): # Index of the layer inside the group layer_idx = int(i - group_idx * layers_per_group) - - layer_group_output = self.albert_layer_groups[group_idx](hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group]) + + layer_group_output = self.albert_layer_groups[group_idx]( + hidden_states, + attention_mask, + head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group], + ) hidden_states = layer_group_output[0] if self.output_attentions: @@ -337,7 +346,6 @@ class AlbertTransformer(nn.Module): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) @@ -346,11 +354,11 @@ class AlbertTransformer(nn.Module): return outputs # last-layer hidden state, (all hidden states), (all attentions) - class AlbertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained 
models. """ + config_class = AlbertConfig pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "albert" @@ -431,8 +439,12 @@ ALBERT_INPUTS_DOCSTRING = r""" ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. """ -@add_start_docstrings("The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.", - ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.", + ALBERT_START_DOCSTRING, + ALBERT_INPUTS_DOCSTRING, +) class AlbertModel(AlbertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -500,8 +512,15 @@ class AlbertModel(AlbertPreTrainedModel): inner_group_idx = int(layer - group_idx * self.config.inner_group_num) self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads) - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, - inputs_embeds=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -520,31 +539,37 @@ class AlbertModel(AlbertPreTrainedModel): token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 if head_mask is not None: if head_mask.dim() == 1: head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.num_hidden_layers - embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds) - encoder_outputs = self.encoder(embedding_output, - extended_attention_mask, - head_mask=head_mask) + embedding_output = self.embeddings( + input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask) sequence_output = encoder_outputs[0] pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) - outputs = (sequence_output, pooled_output) + encoder_outputs[1:] # add hidden_states and attentions if they are here + outputs = (sequence_output, pooled_output) + encoder_outputs[ + 1: + ] # add hidden_states and attentions if they are here return outputs + class 
AlbertMLMHead(nn.Module): def __init__(self, config): super(AlbertMLMHead, self).__init__() @@ -566,7 +591,9 @@ class AlbertMLMHead(nn.Module): return prediction_scores -@add_start_docstrings("Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) +@add_start_docstrings( + "Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING +) class AlbertForMaskedLM(AlbertPreTrainedModel): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -602,21 +629,28 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): """ Make sure we are sharing the input and output embeddings. Export to TorchScript can't handle parameter sharing so we are cloning them instead. """ - self._tie_or_clone_weights(self.predictions.decoder, - self.albert.embeddings.word_embeddings) + self._tie_or_clone_weights(self.predictions.decoder, self.albert.embeddings.word_embeddings) def get_output_embeddings(self): return self.predictions.decoder - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + masked_lm_labels=None, + ): outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, - inputs_embeds=inputs_embeds + inputs_embeds=inputs_embeds, ) sequence_outputs = outputs[0] @@ -631,9 +665,12 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): return outputs -@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", - ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) + ALBERT_START_DOCSTRING, + ALBERT_INPUTS_DOCSTRING, +) class AlbertForSequenceClassification(AlbertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -665,6 +702,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(AlbertForSequenceClassification, self).__init__(config) self.num_labels = config.num_labels @@ -675,8 +713,16 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, - position_ids=None, head_mask=None, inputs_embeds=None, labels=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): outputs = self.albert( input_ids=input_ids, @@ -684,7 +730,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, - inputs_embeds=inputs_embeds + inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] @@ -707,10 +753,12 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): return outputs # (loss), logits, (hidden_states), (attentions) - -@add_start_docstrings("""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) + ALBERT_START_DOCSTRING, + ALBERT_INPUTS_DOCSTRING, +) class AlbertForQuestionAnswering(AlbertPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -752,6 +800,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel): """ + def __init__(self, config): super(AlbertForQuestionAnswering, self).__init__(config) self.num_labels = config.num_labels @@ -761,8 +810,17 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, - inputs_embeds=None, start_positions=None, end_positions=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + ): outputs = self.albert( input_ids=input_ids, @@ -770,7 +828,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel): token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, - inputs_embeds=inputs_embeds + inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index 6b49efd37..31e9ee6bd 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -18,31 +18,87 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging -from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRLConfig, - DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig, - TransfoXLConfig, XLMConfig, XLNetConfig, XLMRobertaConfig) - -from .modeling_bert import BertModel, BertForMaskedLM, 
BertForSequenceClassification, BertForQuestionAnswering, \ - BertForTokenClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .configuration_auto import ( + AlbertConfig, + BertConfig, + CamembertConfig, + CTRLConfig, + DistilBertConfig, + GPT2Config, + OpenAIGPTConfig, + RobertaConfig, + TransfoXLConfig, + XLMConfig, + XLNetConfig, + XLMRobertaConfig, +) + +from .modeling_bert import ( + BertModel, + BertForMaskedLM, + BertForSequenceClassification, + BertForQuestionAnswering, + BertForTokenClassification, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, +) from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_ctrl import CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, \ - XLNetForTokenClassification, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering, \ - XLM_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, \ - RobertaForTokenClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, \ - DistilBertForSequenceClassification, DistilBertForTokenClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, \ - CamembertForMultipleChoice, CamembertForTokenClassification, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, \ - AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_xlnet import ( + XLNetModel, + XLNetLMHeadModel, + XLNetForSequenceClassification, + XLNetForQuestionAnswering, + XLNetForTokenClassification, + XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_xlm import ( + XLMModel, + XLMWithLMHeadModel, + XLMForSequenceClassification, + XLMForQuestionAnswering, + XLM_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_roberta import ( + RobertaModel, + RobertaForMaskedLM, + RobertaForSequenceClassification, + RobertaForTokenClassification, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_distilbert import ( + DistilBertModel, + DistilBertForQuestionAnswering, + DistilBertForMaskedLM, + DistilBertForSequenceClassification, + DistilBertForTokenClassification, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_camembert import ( + CamembertModel, + CamembertForMaskedLM, + CamembertForSequenceClassification, + CamembertForMultipleChoice, + CamembertForTokenClassification, + CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_albert import ( + AlbertModel, + AlbertForMaskedLM, + AlbertForSequenceClassification, + AlbertForQuestionAnswering, + ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, +) from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, \ - XLMRobertaForMultipleChoice, XLMRobertaForTokenClassification, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP +from 
.modeling_xlm_roberta import ( + XLMRobertaModel, + XLMRobertaForMaskedLM, + XLMRobertaForSequenceClassification, + XLMRobertaForMultipleChoice, + XLMRobertaForTokenClassification, + XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, +) from .modeling_utils import PreTrainedModel, SequenceSummary @@ -51,7 +107,8 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) -ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value) +ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict( + (key, value) for pretrained_map in [ BERT_PRETRAINED_MODEL_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, @@ -66,8 +123,9 @@ ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value) CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, - ] - for key, value, in pretrained_map.items()) + ] + for key, value, in pretrained_map.items() +) class AutoModel(object): @@ -98,10 +156,13 @@ class AutoModel(object): This class cannot be instantiated using `__init__()` (throws an error). """ + def __init__(self): - raise EnvironmentError("AutoModel is designed to be instantiated " + raise EnvironmentError( + "AutoModel is designed to be instantiated " "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` or " - "`AutoModel.from_config(config)` methods.") + "`AutoModel.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -232,35 +293,39 @@ class AutoModel(object): model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 't5' in pretrained_model_name_or_path: + if "t5" in pretrained_model_name_or_path: return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: + elif "distilbert" in pretrained_model_name_or_path: return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'albert' in pretrained_model_name_or_path: + elif "albert" in pretrained_model_name_or_path: return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'camembert' in pretrained_model_name_or_path: + elif "camembert" in pretrained_model_name_or_path: return CamembertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm-roberta' in pretrained_model_name_or_path: + elif "xlm-roberta" in pretrained_model_name_or_path: return XLMRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: + elif "roberta" in pretrained_model_name_or_path: return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'openai-gpt' in pretrained_model_name_or_path: + elif "openai-gpt" in pretrained_model_name_or_path: return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'gpt2' in pretrained_model_name_or_path: + elif "gpt2" in pretrained_model_name_or_path: return GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'transfo-xl' in pretrained_model_name_or_path: + elif "transfo-xl" in pretrained_model_name_or_path: return TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" 
in pretrained_model_name_or_path: return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'ctrl' in pretrained_model_name_or_path: + elif "ctrl" in pretrained_model_name_or_path: return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm-roberta', 'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm-roberta', 'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format( + pretrained_model_name_or_path + ) + ) class AutoModelWithLMHead(object): @@ -291,10 +356,13 @@ class AutoModelWithLMHead(object): This class cannot be instantiated using `__init__()` (throws an error). """ + def __init__(self): - raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated " + raise EnvironmentError( + "AutoModelWithLMHead is designed to be instantiated " "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or " - "`AutoModelWithLMHead.from_config(config)` methods.") + "`AutoModelWithLMHead.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -423,35 +491,39 @@ class AutoModelWithLMHead(object): model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 't5' in pretrained_model_name_or_path: + if "t5" in pretrained_model_name_or_path: return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: + elif "distilbert" in pretrained_model_name_or_path: return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'albert' in pretrained_model_name_or_path: + elif "albert" in pretrained_model_name_or_path: return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'camembert' in pretrained_model_name_or_path: + elif "camembert" in pretrained_model_name_or_path: return CamembertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm-roberta' in pretrained_model_name_or_path: + elif "xlm-roberta" in pretrained_model_name_or_path: return XLMRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: + elif "roberta" in pretrained_model_name_or_path: return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return BertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'openai-gpt' in pretrained_model_name_or_path: + elif "openai-gpt" in pretrained_model_name_or_path: return OpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'gpt2' in pretrained_model_name_or_path: + elif "gpt2" in pretrained_model_name_or_path: return GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, 
*model_args, **kwargs) - elif 'transfo-xl' in pretrained_model_name_or_path: + elif "transfo-xl" in pretrained_model_name_or_path: return TransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'ctrl' in pretrained_model_name_or_path: + elif "ctrl" in pretrained_model_name_or_path: return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm-roberta', 'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm-roberta', 'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format( + pretrained_model_name_or_path + ) + ) class AutoModelForSequenceClassification(object): @@ -477,10 +549,13 @@ class AutoModelForSequenceClassification(object): This class cannot be instantiated using `__init__()` (throws an error). """ + def __init__(self): - raise EnvironmentError("AutoModelForSequenceClassification is designed to be instantiated " + raise EnvironmentError( + "AutoModelForSequenceClassification is designed to be instantiated " "using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or " - "`AutoModelForSequenceClassification.from_config(config)` methods.") + "`AutoModelForSequenceClassification.from_config(config)` methods." 
+ ) @classmethod def from_config(cls, config): @@ -597,25 +672,39 @@ class AutoModelForSequenceClassification(object): model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 'distilbert' in pretrained_model_name_or_path: - return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'albert' in pretrained_model_name_or_path: - return AlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'camembert' in pretrained_model_name_or_path: - return CamembertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm-roberta' in pretrained_model_name_or_path: - return XLMRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: - return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + if "distilbert" in pretrained_model_name_or_path: + return DistilBertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "albert" in pretrained_model_name_or_path: + return AlbertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "camembert" in pretrained_model_name_or_path: + return CamembertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "xlm-roberta" in pretrained_model_name_or_path: + return XLMRobertaForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "roberta" in pretrained_model_name_or_path: + return RobertaForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "bert" in pretrained_model_name_or_path: return BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'xlnet', 'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'bert', 'xlnet', 'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format( + pretrained_model_name_or_path + ) + ) class AutoModelForQuestionAnswering(object): @@ -638,10 +727,13 @@ class AutoModelForQuestionAnswering(object): This class cannot be instantiated using `__init__()` (throws an error). 
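# A short usage sketch for the AutoModel* dispatch reformatted above. The concrete class
# is chosen from substrings of the model name, which is why "xlm-roberta" is tested
# before "roberta" and "bert" in every if/elif chain; the checkpoint names below are the
# standard shortcut names used in this file's docstrings.
from transformers import AutoModel, AutoModelWithLMHead

encoder = AutoModel.from_pretrained("bert-base-uncased")  # resolves to BertModel
lm_model = AutoModelWithLMHead.from_pretrained("gpt2")  # resolves to GPT2LMHeadModel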
""" + def __init__(self): - raise EnvironmentError("AutoModelForQuestionAnswering is designed to be instantiated " + raise EnvironmentError( + "AutoModelForQuestionAnswering is designed to be instantiated " "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or " - "`AutoModelForQuestionAnswering.from_config(config)` methods.") + "`AutoModelForQuestionAnswering.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -745,26 +837,30 @@ class AutoModelForQuestionAnswering(object): model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 'distilbert' in pretrained_model_name_or_path: + if "distilbert" in pretrained_model_name_or_path: return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'albert' in pretrained_model_name_or_path: + elif "albert" in pretrained_model_name_or_path: return AlbertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return XLNetForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path) + ) class AutoModelForTokenClassification: def __init__(self): - raise EnvironmentError("AutoModelForTokenClassification is designed to be instantiated " - "using the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " - "`AutoModelForTokenClassification.from_config(config)` methods.") + raise EnvironmentError( + "AutoModelForTokenClassification is designed to be instantiated " + "using the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " + "`AutoModelForTokenClassification.from_config(config)` methods." 
+ ) @classmethod def from_config(cls, config): @@ -797,7 +893,7 @@ class AutoModelForTokenClassification: elif isinstance(config, XLMRobertaConfig): return XLMRobertaForTokenClassification(config) raise ValueError("Unrecognized configuration class {}".format(config)) - + @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the question answering model classes of the library @@ -870,18 +966,28 @@ class AutoModelForTokenClassification: model = AutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 'camembert' in pretrained_model_name_or_path: - return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: - return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm-roberta' in pretrained_model_name_or_path: - return XLMRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: + if "camembert" in pretrained_model_name_or_path: + return CamembertForTokenClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "distilbert" in pretrained_model_name_or_path: + return DistilBertForTokenClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "xlm-roberta" in pretrained_model_name_or_path: + return XLMRobertaForTokenClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "roberta" in pretrained_model_name_or_path: return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return BertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. 
Should contains one of " + "'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format( + pretrained_model_name_or_path + ) + ) diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index ca07a81ae..0994e832d 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -33,27 +33,27 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", - 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", - 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", - 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", - 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", - 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", - 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", - 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", - 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", - 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin", - 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin", - 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin", - 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin", - 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin", - 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin", + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", + "bert-large-uncased": 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", + "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", + "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", + "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", + "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin", + "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin", + "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin", + "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin", + "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin", + "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin", } @@ -65,8 +65,10 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): import numpy as np import tensorflow as tf except ImportError: - logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) @@ -81,7 +83,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): arrays.append(array) for name, array in zip(names, arrays): - name = name.split('/') + name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any(n in ["adam_v", "adam_m", "global_step"] for n in name): @@ -89,18 +91,18 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): continue pointer = model for m_name in name: - if re.fullmatch(r'[A-Za-z]+_\d+', m_name): - l = re.split(r'_(\d+)', m_name) + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + l = re.split(r"_(\d+)", m_name) else: l = [m_name] - if l[0] == 'kernel' or l[0] == 'gamma': - pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias' or l[0] == 'beta': - pointer = getattr(pointer, 'bias') - elif l[0] == 'output_weights': - pointer = getattr(pointer, 'weight') - elif l[0] == 'squad': - pointer = getattr(pointer, 'classifier') + if l[0] == "kernel" or l[0] == "gamma": + pointer = getattr(pointer, "weight") + elif l[0] == "output_bias" or l[0] == "beta": + pointer = getattr(pointer, "bias") + elif l[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif l[0] == "squad": + pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, l[0]) @@ -110,9 +112,9 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): if len(l) >= 2: num = int(l[1]) pointer = pointer[num] - if m_name[-11:] == '_embeddings': - pointer = getattr(pointer, 'weight') - elif m_name == 'kernel': + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": array = np.transpose(array) try: assert pointer.shape == array.shape @@ -157,6 +159,7 @@ BertLayerNorm = torch.nn.LayerNorm class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. 
""" + def __init__(self, config): super(BertEmbeddings, self).__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) @@ -199,7 +202,8 @@ class BertSelfAttention(nn.Module): if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads @@ -217,7 +221,14 @@ class BertSelfAttention(nn.Module): x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) - def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None): + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys @@ -307,8 +318,17 @@ class BertAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None): - self_outputs = self.self(hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask) + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): + self_outputs = self.self( + hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask + ) attention_output = self.output(self_outputs[0], hidden_states) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs @@ -353,13 +373,22 @@ class BertLayer(nn.Module): self.intermediate = BertIntermediate(config) self.output = BertOutput(config) - def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None): + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:] # add self attentions if we output attention weights if self.is_decoder and encoder_hidden_states is not None: - cross_attention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask) + cross_attention_outputs = self.crossattention( + attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask + ) attention_output = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights @@ -376,14 +405,23 @@ class BertEncoder(nn.Module): self.output_hidden_states = config.output_hidden_states self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) - def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None): + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, 
+ ): all_hidden_states = () all_attentions = () for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask) + layer_outputs = layer_module( + hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask + ) hidden_states = layer_outputs[0] if self.output_attentions: @@ -440,9 +478,7 @@ class BertLMPredictionHead(nn.Module): # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. - self.decoder = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.bias = nn.Parameter(torch.zeros(config.vocab_size)) @@ -488,6 +524,7 @@ class BertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = BertConfig pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_bert @@ -581,8 +618,12 @@ BERT_INPUTS_DOCSTRING = r""" ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. """ -@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class BertModel(BertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -612,6 +653,7 @@ class BertModel(BertPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(BertModel, self).__init__(config) self.config = config @@ -636,8 +678,17 @@ class BertModel(BertPreTrainedModel): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, - head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): """ Forward pass on the Model. 
The model can behave as an encoder (with only self-attention) as well @@ -681,12 +732,18 @@ class BertModel(BertPreTrainedModel): batch_size, seq_length = input_shape seq_ids = torch.arange(seq_length, device=device) causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] - causal_mask = causal_mask.to(torch.long) # not converting to long will cause errors with pytorch version < 1.3 + causal_mask = causal_mask.to( + torch.long + ) # not converting to long will cause errors with pytorch version < 1.3 extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] else: extended_attention_mask = attention_mask[:, None, None, :] else: - raise ValueError("Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(input_shape, attention_mask.shape)) + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for @@ -709,10 +766,15 @@ class BertModel(BertPreTrainedModel): elif encoder_attention_mask.dim() == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] else: - raise ValueError("Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(encoder_hidden_shape, - encoder_attention_mask.shape)) - - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + raise ValueError( + "Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format( + encoder_hidden_shape, encoder_attention_mask.shape + ) + ) + + encoder_extended_attention_mask = encoder_extended_attention_mask.to( + dtype=next(self.parameters()).dtype + ) # fp16 compatibility encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 else: encoder_extended_attention_mask = None @@ -727,28 +789,40 @@ class BertModel(BertPreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.num_hidden_layers - embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds) - encoder_outputs = self.encoder(embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask) + embedding_output = self.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + ) 
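The additive attention-mask pattern that black re-wraps in this hunk (a 0/1 mask broadcast to [batch, 1, 1, seq] and turned into 0.0 / -10000.0 before the softmax) can be checked in isolation. A minimal sketch under a toy padding mask; the function name and dtype handling here are illustrative, not taken from the patch:

    import torch

    def extend_attention_mask(attention_mask, dtype=torch.float32):
        # [batch, seq] 0/1 mask -> [batch, 1, 1, seq] additive mask.
        # 1 keeps a position, 0 masks it; after the transform, masked
        # positions add -10000.0 to the pre-softmax attention scores,
        # which is effectively the same as removing them.
        extended = attention_mask[:, None, None, :].to(dtype=dtype)
        return (1.0 - extended) * -10000.0

    mask = torch.tensor([[1, 1, 1, 0]])
    print(extend_attention_mask(mask))  # only the last position becomes -10000.0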
sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + outputs = (sequence_output, pooled_output,) + encoder_outputs[ + 1: + ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training: +@add_start_docstrings( + """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. """, - BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class BertForPreTraining(BertPreTrainedModel): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -786,6 +860,7 @@ class BertForPreTraining(BertPreTrainedModel): prediction_scores, seq_relationship_scores = outputs[:2] """ + def __init__(self, config): super(BertForPreTraining, self).__init__(config) @@ -797,20 +872,33 @@ class BertForPreTraining(BertPreTrainedModel): def get_output_embeddings(self): return self.cls.predictions.decoder - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None, next_sentence_label=None): - - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + masked_lm_labels=None, + next_sentence_label=None, + ): + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) - outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + outputs = (prediction_scores, seq_relationship_score,) + outputs[ + 2: + ] # add hidden states and attention if they are here if masked_lm_labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss() @@ -822,9 +910,9 @@ class BertForPreTraining(BertPreTrainedModel): return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, - BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """Bert Model with a `language modeling` head on top. 
""", BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING +) class BertForMaskedLM(BertPreTrainedModel): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -862,6 +950,7 @@ class BertForMaskedLM(BertPreTrainedModel): loss, prediction_scores = outputs[:2] """ + def __init__(self, config): super(BertForMaskedLM, self).__init__(config) @@ -873,17 +962,30 @@ class BertForMaskedLM(BertPreTrainedModel): def get_output_embeddings(self): return self.cls.predictions.decoder - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ): - - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + masked_lm_labels=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + lm_labels=None, + ): + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) @@ -912,9 +1014,11 @@ class BertForMaskedLM(BertPreTrainedModel): return outputs # (masked_lm_loss), (ltr_lm_loss), prediction_scores, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """, - BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. 
""", + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class BertForNextSentencePrediction(BertPreTrainedModel): r""" **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -945,6 +1049,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel): seq_relationship_scores = outputs[0] """ + def __init__(self, config): super(BertForNextSentencePrediction, self).__init__(config) @@ -953,15 +1058,25 @@ class BertForNextSentencePrediction(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - next_sentence_label=None): - - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + next_sentence_label=None, + ): + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) pooled_output = outputs[1] @@ -976,10 +1091,12 @@ class BertForNextSentencePrediction(BertPreTrainedModel): return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class BertForSequenceClassification(BertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -1011,6 +1128,7 @@ class BertForSequenceClassification(BertPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(BertForSequenceClassification, self).__init__(config) self.num_labels = config.num_labels @@ -1021,15 +1139,25 @@ class BertForSequenceClassification(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, - position_ids=None, head_mask=None, inputs_embeds=None, labels=None): - - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) pooled_output = outputs[1] @@ -1051,10 +1179,12 @@ class BertForSequenceClassification(BertPreTrainedModel): return outputs # (loss), logits, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of +@add_start_docstrings( + """Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", - BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class BertForMultipleChoice(BertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -1087,6 +1217,7 @@ class BertForMultipleChoice(BertPreTrainedModel): loss, classification_scores = outputs[:2] """ + def __init__(self, config): super(BertForMultipleChoice, self).__init__(config) @@ -1096,8 +1227,16 @@ class BertForMultipleChoice(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, - position_ids=None, head_mask=None, inputs_embeds=None, labels=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): num_choices = input_ids.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) @@ -1105,12 +1244,14 @@ class BertForMultipleChoice(BertPreTrainedModel): token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) pooled_output = outputs[1] @@ -1128,10 +1269,12 @@ class BertForMultipleChoice(BertPreTrainedModel): return outputs # (loss), reshaped_logits, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", - BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class BertForTokenClassification(BertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -1161,6 +1304,7 @@ class BertForTokenClassification(BertPreTrainedModel): loss, scores = outputs[:2] """ + def __init__(self, config): super(BertForTokenClassification, self).__init__(config) self.num_labels = config.num_labels @@ -1171,15 +1315,25 @@ class BertForTokenClassification(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, - position_ids=None, head_mask=None, inputs_embeds=None, labels=None): - - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] @@ -1202,10 +1356,12 @@ class BertForTokenClassification(BertPreTrainedModel): return outputs # (loss), scores, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class BertForQuestionAnswering(BertPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -1247,6 +1403,7 @@ class BertForQuestionAnswering(BertPreTrainedModel): """ + def __init__(self, config): super(BertForQuestionAnswering, self).__init__(config) self.num_labels = config.num_labels @@ -1256,15 +1413,26 @@ class BertForQuestionAnswering(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - start_positions=None, end_positions=None): - - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + ): + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] diff --git a/transformers/modeling_camembert.py b/transformers/modeling_camembert.py index 1b808bfd8..2a7a7a733 100644 --- a/transformers/modeling_camembert.py +++ b/transformers/modeling_camembert.py @@ -15,19 +15,24 @@ # limitations under the License. """PyTorch CamemBERT model. 
""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging -from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification +from .modeling_roberta import ( + RobertaModel, + RobertaForMaskedLM, + RobertaForSequenceClassification, + RobertaForMultipleChoice, + RobertaForTokenClassification, +) from .configuration_camembert import CamembertConfig from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-pytorch_model.bin", + "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-pytorch_model.bin", } @@ -100,8 +105,12 @@ CAMEMBERT_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", - CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", + CAMEMBERT_START_DOCSTRING, + CAMEMBERT_INPUTS_DOCSTRING, +) class CamembertModel(RobertaModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -149,8 +158,11 @@ class CamembertModel(RobertaModel): pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""CamemBERT Model with a `language modeling` head on top. """, - CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """CamemBERT Model with a `language modeling` head on top. """, + CAMEMBERT_START_DOCSTRING, + CAMEMBERT_INPUTS_DOCSTRING, +) class CamembertForMaskedLM(RobertaForMaskedLM): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -185,9 +197,12 @@ class CamembertForMaskedLM(RobertaForMaskedLM): pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer +@add_start_docstrings( + """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING) + CAMEMBERT_START_DOCSTRING, + CAMEMBERT_INPUTS_DOCSTRING, +) class CamembertForSequenceClassification(RobertaForSequenceClassification): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -223,9 +238,12 @@ class CamembertForSequenceClassification(RobertaForSequenceClassification): pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""CamemBERT Model with a multiple choice classification head on top (a linear layer on top of +@add_start_docstrings( + """CamemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", - CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING) + CAMEMBERT_START_DOCSTRING, + CAMEMBERT_INPUTS_DOCSTRING, +) class CamembertForMultipleChoice(RobertaForMultipleChoice): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -257,9 +275,12 @@ class CamembertForMultipleChoice(RobertaForMultipleChoice): pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""CamemBERT Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING) + CAMEMBERT_START_DOCSTRING, + CAMEMBERT_INPUTS_DOCSTRING, +) class CamembertForTokenClassification(RobertaForTokenClassification): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: diff --git a/transformers/modeling_ctrl.py b/transformers/modeling_ctrl.py index fabb79efd..37c15cf54 100644 --- a/transformers/modeling_ctrl.py +++ b/transformers/modeling_ctrl.py @@ -40,14 +40,17 @@ CTRL_PRETRAINED_MODEL_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf- def angle_defn(pos, i, d_model_size): - angle_rates = 1 / torch.pow(10000, (2 * (i//2)) / d_model_size) + angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size) return pos * angle_rates + def positional_encoding(position, d_model_size, dtype): # create the sinusoidal pattern for the positional encoding - angle_rads = (angle_defn(torch.arange(position, dtype=dtype).unsqueeze(1), - torch.arange(d_model_size, dtype=dtype).unsqueeze(0), - d_model_size)) + angle_rads = angle_defn( + torch.arange(position, dtype=dtype).unsqueeze(1), + torch.arange(d_model_size, dtype=dtype).unsqueeze(0), + d_model_size, + ) sines = torch.sin(angle_rads[:, 0::2]) cosines = torch.cos(angle_rads[:, 1::2]) @@ -55,22 +58,23 @@ def positional_encoding(position, d_model_size, dtype): pos_encoding = torch.cat([sines, cosines], dim=-1) return pos_encoding + def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None): # calculate attention - matmul_qk = torch.matmul(q, k.permute(0,1,3,2)) + matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2)) dk = k.shape[-1] scaled_attention_logits = matmul_qk / np.sqrt(dk) if mask is not None: nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1) - scaled_attention_logits += (mask[ns-nd:ns, :ns] * -1e4) + scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4 if attention_mask is not None: # Apply the attention mask scaled_attention_logits = scaled_attention_logits + attention_mask - attention_weights = torch.softmax(scaled_attention_logits, dim=-1) + attention_weights = torch.softmax(scaled_attention_logits, dim=-1) # Mask heads if we want to if head_mask is not None: @@ -128,11 +132,8 @@ class MultiHeadAttention(torch.nn.Module): return outputs - def point_wise_feed_forward_network(d_model_size, dff): - return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff), - torch.nn.ReLU(), - torch.nn.Linear(dff, d_model_size)) + return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff), torch.nn.ReLU(), torch.nn.Linear(dff, d_model_size)) class EncoderLayer(torch.nn.Module): @@ -150,10 +151,9 @@ class EncoderLayer(torch.nn.Module): def forward(self, x, mask, layer_past=None, attention_mask=None, head_mask=None): normed = self.layernorm1(x) - attn_outputs 
= self.multi_head_attention(normed, normed, normed, mask, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask) + attn_outputs = self.multi_head_attention( + normed, normed, normed, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask + ) attn_output = attn_outputs[0] attn_output = self.dropout1(attn_output) out1 = x + attn_output @@ -171,6 +171,7 @@ class CTRLPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = CTRLConfig pretrained_model_archive_map = CTRL_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -244,8 +245,12 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs: than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", - CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", + CTRL_START_DOCSTRING, + CTRL_INPUTS_DOCSTRING, +) class CTRLModel(CTRLPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -273,6 +278,7 @@ class CTRLModel(CTRLPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(CTRLModel, self).__init__(config) self.output_hidden_states = config.output_hidden_states @@ -287,11 +293,12 @@ class CTRLModel(CTRLPreTrainedModel): self.w = nn.Embedding(config.vocab_size, config.n_embd) self.dropout = nn.Dropout(config.embd_pdrop) - self.h = nn.ModuleList([EncoderLayer(config.n_embd, - config.n_head, - config.dff, - config.resid_pdrop, - config.output_attentions) for _ in range(config.n_layer)]) + self.h = nn.ModuleList( + [ + EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop, config.output_attentions) + for _ in range(config.n_layer) + ] + ) self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.init_weights() @@ -309,7 +316,16 @@ class CTRLModel(CTRLPreTrainedModel): for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) - def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None): + def forward( + self, + input_ids=None, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -345,7 +361,7 @@ class CTRLModel(CTRLPreTrainedModel): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
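The sinusoidal positional encoding reformatted in the CTRL hunk above (angle rates of 1/10000^(2*(i//2)/d), sines of the even columns concatenated with cosines of the odd columns) is compact enough for a standalone check. A minimal sketch following that same convention; the function name is illustrative:

    import torch

    def sinusoidal_positional_encoding(position, d_model_size, dtype=torch.float32):
        pos = torch.arange(position, dtype=dtype).unsqueeze(1)      # (position, 1)
        i = torch.arange(d_model_size, dtype=dtype).unsqueeze(0)    # (1, d_model_size)
        angle_rads = pos / torch.pow(10000, (2 * (i // 2)) / d_model_size)
        # Unlike the interleaved formulation, CTRL concatenates sin of the
        # even columns with cos of the odd columns along the last dimension.
        return torch.cat([torch.sin(angle_rads[:, 0::2]), torch.cos(angle_rads[:, 1::2])], dim=-1)

    print(sinusoidal_positional_encoding(4, 8).shape)  # torch.Size([4, 8])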
- attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility attention_mask = (1.0 - attention_mask) * -10000.0 # Prepare head mask if needed @@ -357,8 +373,12 @@ class CTRLModel(CTRLPreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.n_layer @@ -391,11 +411,9 @@ class CTRLModel(CTRLPreTrainedModel): for i, (h, layer_past) in enumerate(zip(self.h, past)): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) - outputs = h(hidden_states, - mask, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask[i]) + outputs = h( + hidden_states, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i] + ) hidden_states, present = outputs[:2] if self.output_past: presents = presents + (present,) @@ -421,8 +439,12 @@ class CTRLModel(CTRLPreTrainedModel): return outputs -@add_start_docstrings("""The CTRL Model transformer with a language modeling head on top -(linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING) +@add_start_docstrings( + """The CTRL Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). 
""", + CTRL_START_DOCSTRING, + CTRL_INPUTS_DOCSTRING, +) class CTRLLMHeadModel(CTRLPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -463,6 +485,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(CTRLLMHeadModel, self).__init__(config) self.transformer = CTRLModel(config) @@ -473,15 +496,26 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): def get_output_embeddings(self): return self.lm_head - def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - labels=None): - transformer_outputs = self.transformer(input_ids, - past=past, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) hidden_states = transformer_outputs[0] @@ -495,8 +529,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions) diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py index 7098529c9..7345c2365 100644 --- a/transformers/modeling_distilbert.py +++ b/transformers/modeling_distilbert.py @@ -37,14 +37,15 @@ from .configuration_distilbert import DistilBertConfig from .file_utils import add_start_docstrings import logging + logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin", - 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin", - 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin", - 'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin", + "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin", + "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin", + "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin", + "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin", } @@ -52,26 +53,24 @@ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { def gelu(x): return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0))) + def create_sinusoidal_embeddings(n_pos, dim, out): - position_enc = np.array([ 
- [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] - for pos in range(n_pos) - ]) + position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() out.requires_grad = False + class Embeddings(nn.Module): - def __init__(self, - config): + def __init__(self, config): super(Embeddings, self).__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=0) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) if config.sinusoidal_pos_embds: - create_sinusoidal_embeddings(n_pos=config.max_position_embeddings, - dim=config.dim, - out=self.position_embeddings.weight) + create_sinusoidal_embeddings( + n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight + ) self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) self.dropout = nn.Dropout(config.dropout) @@ -89,17 +88,18 @@ class Embeddings(nn.Module): The embedded tokens (plus position embeddings, no token_type embeddings) """ seq_length = input_ids.size(1) - position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) - position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) - word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) - position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) + word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) + position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) - embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) - embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) + embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) + embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) return embeddings + class MultiHeadSelfAttention(nn.Module): def __init__(self, config): super(MultiHeadSelfAttention, self).__init__() @@ -139,7 +139,7 @@ class MultiHeadSelfAttention(nn.Module): self.dim = attention_head_size * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) - def forward(self, query, key, value, mask, head_mask = None): + def forward(self, query, key, value, mask, head_mask=None): """ Parameters ---------- @@ -172,39 +172,42 @@ class MultiHeadSelfAttention(nn.Module): """ group heads """ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) - q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) - k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) - v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) + q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) - q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) - scores = torch.matmul(q, k.transpose(2,3)) # (bs, n_heads, q_length, k_length) - mask = (mask==0).view(mask_reshp).expand_as(scores) # (bs, n_heads, 
q_length, k_length) - scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, q_length, k_length) + q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) + scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length) + mask = (mask == 0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) + scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, q_length, k_length) - weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) - weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) + weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) + weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask - context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) - context = unshape(context) # (bs, q_length, dim) - context = self.out_lin(context) # (bs, q_length, dim) + context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) + context = unshape(context) # (bs, q_length, dim) + context = self.out_lin(context) # (bs, q_length, dim) if self.output_attentions: return (context, weights) else: return (context,) + class FFN(nn.Module): def __init__(self, config): super(FFN, self).__init__() self.dropout = nn.Dropout(p=config.dropout) self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim) self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim) - assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation) - self.activation = gelu if config.activation == 'gelu' else nn.ReLU() + assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format( + config.activation + ) + self.activation = gelu if config.activation == "gelu" else nn.ReLU() def forward(self, input): x = self.lin1(input) @@ -213,6 +216,7 @@ class FFN(nn.Module): x = self.dropout(x) return x + class TransformerBlock(nn.Module): def __init__(self, config): super(TransformerBlock, self).__init__() @@ -249,14 +253,14 @@ class TransformerBlock(nn.Module): # Self-Attention sa_output = self.attention(query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask) if self.output_attentions: - sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) - else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples + sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) + else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples assert type(sa_output) == tuple sa_output = sa_output[0] - sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) + sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) # Feed Forward Network - ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) + ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) output = (ffn_output,) @@ -303,9 +307,7 @@ class Transformer(nn.Module): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) - layer_outputs = layer_module(x=hidden_state, - attn_mask=attn_mask, - head_mask=head_mask[i]) + layer_outputs = layer_module(x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i]) hidden_state = layer_outputs[-1] if self.output_attentions: @@ -332,6 
+334,7 @@ class DistilBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ + config_class = DistilBertConfig pretrained_model_archive_map = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = None @@ -396,8 +399,12 @@ DISTILBERT_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class DistilBertModel(DistilBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -420,11 +427,12 @@ class DistilBertModel(DistilBertPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(DistilBertModel, self).__init__(config) - self.embeddings = Embeddings(config) # Embeddings - self.transformer = Transformer(config) # Encoder + self.embeddings = Embeddings(config) # Embeddings + self.transformer = Transformer(config) # Encoder self.init_weights() @@ -442,8 +450,7 @@ class DistilBertModel(DistilBertPreTrainedModel): for layer, heads in heads_to_prune.items(): self.transformer.layer[layer].attention.prune_heads(heads) - def forward(self, - input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None): + def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -456,7 +463,7 @@ class DistilBertModel(DistilBertPreTrainedModel): device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: - attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) + attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -468,24 +475,29 @@ class DistilBertModel(DistilBertPreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.num_hidden_layers if inputs_embeds is None: - inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim) - tfmr_output = self.transformer(x=inputs_embeds, - attn_mask=attention_mask, - head_mask=head_mask) + inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim) + tfmr_output = self.transformer(x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask) hidden_state = tfmr_output[0] - output = 
(hidden_state, ) + tfmr_output[1:] + output = (hidden_state,) + tfmr_output[1:] - return output # last-layer hidden-state, (all hidden_states), (all attentions) + return output # last-layer hidden-state, (all hidden_states), (all attentions) -@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """, - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """DistilBert Model with a `masked language modeling` head on top. """, + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class DistilBertForMaskedLM(DistilBertPreTrainedModel): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -516,6 +528,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): loss, prediction_scores = outputs[:2] """ + def __init__(self, config): super(DistilBertForMaskedLM, self).__init__(config) self.output_attentions = config.output_attentions @@ -534,28 +547,31 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): return self.vocab_projector def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None): - dlbrt_output = self.distilbert(input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) - hidden_states = dlbrt_output[0] # (bs, seq_length, dim) - prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) - prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) - prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) + dlbrt_output = self.distilbert( + input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds + ) + hidden_states = dlbrt_output[0] # (bs, seq_length, dim) + prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) + prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) + prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) - outputs = (prediction_logits, ) + dlbrt_output[1:] + outputs = (prediction_logits,) + dlbrt_output[1:] if masked_lm_labels is not None: - mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), - masked_lm_labels.view(-1)) - outputs = (mlm_loss,) + outputs + mlm_loss = self.mlm_loss_fct( + prediction_logits.view(-1, prediction_logits.size(-1)), masked_lm_labels.view(-1) + ) + outputs = (mlm_loss,) + outputs - return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions) + return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions) -@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class DistilBertForSequenceClassification(DistilBertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -587,6 +603,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(DistilBertForSequenceClassification, self).__init__(config) self.num_labels = config.num_labels @@ -599,16 +616,15 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): self.init_weights() def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None): - distilbert_output = self.distilbert(input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) - hidden_state = distilbert_output[0] # (bs, seq_len, dim) - pooled_output = hidden_state[:, 0] # (bs, dim) - pooled_output = self.pre_classifier(pooled_output) # (bs, dim) - pooled_output = nn.ReLU()(pooled_output) # (bs, dim) - pooled_output = self.dropout(pooled_output) # (bs, dim) - logits = self.classifier(pooled_output) # (bs, dim) + distilbert_output = self.distilbert( + input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds + ) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = nn.ReLU()(pooled_output) # (bs, dim) + pooled_output = self.dropout(pooled_output) # (bs, dim) + logits = self.classifier(pooled_output) # (bs, dim) outputs = (logits,) + distilbert_output[1:] if labels is not None: @@ -623,9 +639,12 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): return outputs # (loss), logits, (hidden_states), (attentions) -@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -663,6 +682,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): loss, start_scores, end_scores = outputs[:3] """ + def __init__(self, config): super(DistilBertForQuestionAnswering, self).__init__(config) @@ -672,19 +692,26 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): self.dropout = nn.Dropout(config.qa_dropout) self.init_weights() - - def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None): - distilbert_output = self.distilbert(input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) - hidden_states = distilbert_output[0] # (bs, max_query_len, dim) - - hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) - logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) + + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + ): + distilbert_output = self.distilbert( + input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds + ) + hidden_states = distilbert_output[0] # (bs, max_query_len, dim) + + hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) + logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) # (bs, max_query_len) - end_logits = end_logits.squeeze(-1) # (bs, max_query_len) + start_logits = start_logits.squeeze(-1) # (bs, max_query_len) + end_logits = end_logits.squeeze(-1) # (bs, max_query_len) outputs = (start_logits, end_logits,) + distilbert_output[1:] if start_positions is not None and end_positions is not None: @@ -707,10 +734,12 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) -@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", - DISTILBERT_START_DOCSTRING, - DISTILBERT_INPUTS_DOCSTRING) + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class DistilBertForTokenClassification(DistilBertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -740,6 +769,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): loss, scores = outputs[:2] """ + def __init__(self, config): super(DistilBertForTokenClassification, self).__init__(config) self.num_labels = config.num_labels @@ -750,13 +780,11 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, head_mask=None, - inputs_embeds=None, labels=None): + def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None): - outputs = self.distilbert(input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.distilbert( + input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds + ) sequence_output = outputs[0] diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py index ddfebdc39..e5bad422c 100644 --- a/transformers/modeling_encoder_decoder.py +++ b/transformers/modeling_encoder_decoder.py @@ -145,16 +145,12 @@ class PreTrainedEncoderDecoder(nn.Module): # by the value of the flag `is_decoder` that we need to set correctly. encoder = kwargs_encoder.pop("model", None) if encoder is None: - encoder = AutoModel.from_pretrained( - encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder - ) + encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder) encoder.config.is_decoder = False decoder = kwargs_decoder.pop("model", None) if decoder is None: - decoder = AutoModelWithLMHead.from_pretrained( - decoder_pretrained_model_name_or_path, **kwargs_decoder - ) + decoder = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder) decoder.config.is_decoder = True model = cls(encoder, decoder) @@ -168,18 +164,23 @@ class PreTrainedEncoderDecoder(nn.Module): We save the encoder' and decoder's parameters in two separate directories. """ - # If the root output directory does not exist, create it + # If the root output directory does not exist, create it if not os.path.exists(save_directory): os.mkdir(save_directory) # Check whether the output directory is empty or not - sub_directories = [directory for directory in os.listdir(save_directory) - if os.path.isdir(os.path.join(save_directory, directory))] + sub_directories = [ + directory + for directory in os.listdir(save_directory) + if os.path.isdir(os.path.join(save_directory, directory)) + ] if len(sub_directories) > 0: if "encoder" in sub_directories and "decoder" in sub_directories: - print("WARNING: there is an older version of encoder-decoder saved in" +\ - " the output directory. The default behaviour is to overwrite them.") + print( + "WARNING: there is an older version of encoder-decoder saved in" + + " the output directory. The default behaviour is to overwrite them." 
+ ) # Empty the output directory for directory_to_remove in sub_directories: @@ -190,7 +191,7 @@ class PreTrainedEncoderDecoder(nn.Module): # Remove the subdirectory itself os.rmdir(os.path.join(save_directory, directory_to_remove)) - assert(len(os.listdir(save_directory)) == 0) # sanity check + assert len(os.listdir(save_directory)) == 0 # sanity check # Create the "encoder" directory inside the output directory and save the encoder into it if not os.path.exists(os.path.join(save_directory, "encoder")): diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py index 3a7561ca5..fe8a973f0 100644 --- a/transformers/modeling_gpt2.py +++ b/transformers/modeling_gpt2.py @@ -36,11 +36,14 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) -GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin", - "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin", - "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin", - "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-pytorch_model.bin", - "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",} +GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin", + "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-pytorch_model.bin", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin", +} + def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): """ Load tf checkpoints in a pytorch model @@ -50,8 +53,10 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): import numpy as np import tensorflow as tf except ImportError: - logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) raise tf_path = os.path.abspath(gpt2_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) @@ -67,20 +72,20 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): for name, array in zip(names, arrays): name = name[6:] # skip "model/" - name = name.split('/') + name = name.split("/") pointer = model for m_name in name: - if re.fullmatch(r'[A-Za-z]+\d+', m_name): - l = re.split(r'(\d+)', m_name) + if re.fullmatch(r"[A-Za-z]+\d+", m_name): + l = re.split(r"(\d+)", m_name) else: l = [m_name] - if l[0] == 'w' or l[0] == 'g': - pointer = getattr(pointer, 'weight') - elif l[0] == 'b': - pointer = getattr(pointer, 'bias') - elif l[0] == 'wpe' or l[0] == 'wte': + if l[0] == "w" or l[0] == "g": + pointer = getattr(pointer, "weight") + elif l[0] == "b": + pointer = getattr(pointer, "bias") + elif l[0] == "wpe" or l[0] == "wte": pointer = getattr(pointer, l[0]) - pointer = getattr(pointer, 'weight') + pointer = getattr(pointer, "weight") else: pointer = getattr(pointer, l[0]) if len(l) >= 2: @@ -130,7 +135,7 @@ class Attention(nn.Module): mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() - index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)]) + index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) # Prune conv1d layers self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) @@ -146,7 +151,7 @@ class Attention(nn.Module): if self.scale: w = w / math.sqrt(v.size(-1)) nd, ns = w.size(-2), w.size(-1) - b = self.bias[:, :, ns-nd:ns, :ns] + b = self.bias[:, :, ns - nd : ns, :ns] w = w * b - 1e4 * (1 - b) if attention_mask is not None: @@ -226,10 +231,9 @@ class Block(nn.Module): self.mlp = MLP(4 * nx, config) def forward(self, x, layer_past=None, attention_mask=None, head_mask=None): - output_attn = self.attn(self.ln_1(x), - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask) + output_attn = self.attn( + self.ln_1(x), layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask + ) a = output_attn[0] # output_attn: a, present, (attentions) x = x + a @@ -244,6 +248,7 @@ class GPT2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = GPT2Config pretrained_model_archive_map = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_gpt2 @@ -321,8 +326,12 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs: than the model's internal embedding lookup matrix. 
""" -@add_start_docstrings("The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.", - GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.", + GPT2_START_DOCSTRING, + GPT2_INPUTS_DOCSTRING, +) class GPT2Model(GPT2PreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -350,6 +359,7 @@ class GPT2Model(GPT2PreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(GPT2Model, self).__init__(config) self.output_hidden_states = config.output_hidden_states @@ -377,7 +387,16 @@ class GPT2Model(GPT2PreTrainedModel): for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) - def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None): + def forward( + self, + input_ids=None, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -418,7 +437,7 @@ class GPT2Model(GPT2PreTrainedModel): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility attention_mask = (1.0 - attention_mask) * -10000.0 # Prepare head mask if needed @@ -430,8 +449,12 @@ class GPT2Model(GPT2PreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.n_layer @@ -454,10 +477,9 @@ class GPT2Model(GPT2PreTrainedModel): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) - outputs = block(hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask[i]) + outputs = block( + hidden_states, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i] + ) hidden_states, present = outputs[:2] if self.output_past: @@ -486,8 +508,12 @@ class GPT2Model(GPT2PreTrainedModel): return outputs # last hidden state, (presents), (all hidden_states), (attentions) -@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top -(linear layer with weights tied to the input embeddings). 
""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +@add_start_docstrings( + """The GPT2 Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). """, + GPT2_START_DOCSTRING, + GPT2_INPUTS_DOCSTRING, +) class GPT2LMHeadModel(GPT2PreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -528,6 +554,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(GPT2LMHeadModel, self).__init__(config) self.transformer = GPT2Model(config) @@ -538,15 +565,26 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): def get_output_embeddings(self): return self.lm_head - def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - labels=None): - transformer_outputs = self.transformer(input_ids, - past=past, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) @@ -558,18 +596,21 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions) -@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification +@add_start_docstrings( + """The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). 
-""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +""", + GPT2_START_DOCSTRING, + GPT2_INPUTS_DOCSTRING, +) class GPT2DoubleHeadsModel(GPT2PreTrainedModel): r""" **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``: @@ -632,6 +673,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): lm_prediction_scores, mc_prediction_scores = outputs[:2] """ + def __init__(self, config): super(GPT2DoubleHeadsModel, self).__init__(config) config.num_labels = 1 @@ -644,15 +686,28 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): def get_output_embeddings(self): return self.lm_head - def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - mc_token_ids=None, lm_labels=None, mc_labels=None): - transformer_outputs = self.transformer(input_ids, - past=past, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + mc_token_ids=None, + lm_labels=None, + mc_labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) hidden_states = transformer_outputs[0] @@ -662,15 +717,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): outputs = (lm_logits, mc_logits) + transformer_outputs[1:] if mc_labels is not None: loss_fct = CrossEntropyLoss() - loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), - mc_labels.view(-1)) + loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) outputs = (loss,) + outputs if lm_labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions) diff --git a/transformers/modeling_mmbt.py b/transformers/modeling_mmbt.py index 79a717ba2..1c173ac69 100644 --- a/transformers/modeling_mmbt.py +++ b/transformers/modeling_mmbt.py @@ -15,8 +15,7 @@ # limitations under the License. """PyTorch MMBT model. """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging @@ -32,6 +31,7 @@ logger = logging.getLogger(__name__) class ModalEmbeddings(nn.Module): """Generic Modal Embeddings which takes in an encoder, and a transformer embedding. 
""" + def __init__(self, config, encoder, embeddings): super(ModalEmbeddings, self).__init__() self.config = config @@ -62,7 +62,9 @@ class ModalEmbeddings(nn.Module): position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length) if token_type_ids is None: - token_type_ids = torch.zeros((input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device) + token_type_ids = torch.zeros( + (input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device + ) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) @@ -140,8 +142,12 @@ MMBT_INPUTS_DOCSTRING = r""" Inputs: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. """ -@add_start_docstrings("The bare MMBT Model outputting raw hidden-states without any specific head on top.", - MMBT_START_DOCSTRING, MMBT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare MMBT Model outputting raw hidden-states without any specific head on top.", + MMBT_START_DOCSTRING, + MMBT_INPUTS_DOCSTRING, +) class MMBTModel(nn.Module): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -167,19 +173,29 @@ class MMBTModel(nn.Module): encoder = ImageEncoder(args) mmbt = MMBTModel(config, transformer, encoder) """ + def __init__(self, config, transformer, encoder): super(MMBTModel, self).__init__() self.config = config self.transformer = transformer self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings) - def forward(self, input_modal, input_ids=None, modal_start_tokens=None, - modal_end_tokens=None, attention_mask=None, - token_type_ids=None, modal_token_type_ids=None, - position_ids=None, modal_position_ids=None, head_mask=None, - inputs_embeds=None, encoder_hidden_states=None, - encoder_attention_mask=None): - + def forward( + self, + input_modal, + input_ids=None, + modal_start_tokens=None, + modal_end_tokens=None, + attention_mask=None, + token_type_ids=None, + modal_token_type_ids=None, + position_ids=None, + modal_position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -192,21 +208,22 @@ class MMBTModel(nn.Module): device = input_ids.device if input_ids is not None else inputs_embeds.device - modal_embeddings = self.modal_encoder(input_modal, - start_token=modal_start_tokens, - end_token=modal_end_tokens, - position_ids=modal_position_ids, - token_type_ids=modal_token_type_ids) + modal_embeddings = self.modal_encoder( + input_modal, + start_token=modal_start_tokens, + end_token=modal_end_tokens, + position_ids=modal_position_ids, + token_type_ids=modal_token_type_ids, + ) input_modal_shape = modal_embeddings.size()[:-1] if token_type_ids is None: token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device) - txt_embeddings = self.transformer.embeddings(input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds) + txt_embeddings = self.transformer.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1) @@ -215,12 +232,16 @@ class MMBTModel(nn.Module): if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) else: - 
attention_mask = torch.cat([torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1) + attention_mask = torch.cat( + [torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1 + ) if encoder_attention_mask is None: encoder_attention_mask = torch.ones(input_shape, device=device) else: - encoder_attention_mask = torch.cat([torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1) + encoder_attention_mask = torch.cat( + [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1 + ) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. @@ -254,7 +275,9 @@ class MMBTModel(nn.Module): if encoder_attention_mask.dim() == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + encoder_extended_attention_mask = encoder_extended_attention_mask.to( + dtype=next(self.parameters()).dtype + ) # fp16 compatibility encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 # Prepare head mask if needed @@ -267,25 +290,31 @@ class MMBTModel(nn.Module): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.num_hidden_layers - - encoder_outputs = self.transformer.encoder(embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask) + encoder_outputs = self.transformer.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + ) sequence_output = encoder_outputs[0] pooled_output = self.transformer.pooler(sequence_output) - outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + outputs = (sequence_output, pooled_output,) + encoder_outputs[ + 1: + ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) - def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -293,8 +322,12 @@ class MMBTModel(nn.Module): self.embeddings.word_embeddings = value -@add_start_docstrings("""MMBT Model with a sequence classification/regression head on top (a linear layer on top of - the pooled output)""", MMBT_START_DOCSTRING, MMBT_INPUTS_DOCSTRING) +@add_start_docstrings( + """MMBT Model with a sequence classification/regression head on top (a linear layer on top of + the pooled output)""", + MMBT_START_DOCSTRING, + MMBT_INPUTS_DOCSTRING, +) class MMBTForClassification(nn.Module): r""" **labels**: (`optional`) 
``torch.LongTensor`` of shape ``(batch_size,)``: @@ -333,20 +366,35 @@ class MMBTForClassification(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - def forward(self, input_modal, input_ids=None, modal_start_tokens=None, modal_end_tokens=None, - attention_mask=None, token_type_ids=None, modal_token_type_ids=None, position_ids=None, - modal_position_ids=None, head_mask=None, inputs_embeds=None, labels=None): - - outputs = self.mmbt(input_modal=input_modal, input_ids=input_ids, - modal_start_tokens=modal_start_tokens, - modal_end_tokens=modal_end_tokens, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - modal_token_type_ids=modal_token_type_ids, - position_ids=position_ids, - modal_position_ids=modal_position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_modal, + input_ids=None, + modal_start_tokens=None, + modal_end_tokens=None, + attention_mask=None, + token_type_ids=None, + modal_token_type_ids=None, + position_ids=None, + modal_position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + + outputs = self.mmbt( + input_modal=input_modal, + input_ids=input_ids, + modal_start_tokens=modal_start_tokens, + modal_end_tokens=modal_end_tokens, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + modal_token_type_ids=modal_token_type_ids, + position_ids=position_ids, + modal_position_ids=modal_position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) pooled_output = outputs[1] @@ -365,4 +413,4 @@ class MMBTForClassification(nn.Module): loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs - return outputs # (loss), logits, (hidden_states), (attentions) \ No newline at end of file + return outputs # (loss), logits, (hidden_states), (attentions) diff --git a/transformers/modeling_openai.py b/transformers/modeling_openai.py index 2f08b4093..ed746ecac 100644 --- a/transformers/modeling_openai.py +++ b/transformers/modeling_openai.py @@ -36,7 +36,9 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) -OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"} +OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = { + "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin" +} def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): @@ -45,17 +47,17 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): import re import numpy as np - if '.ckpt' in openai_checkpoint_folder_path: + if ".ckpt" in openai_checkpoint_folder_path: openai_checkpoint_folder_path = os.path.dirname(openai_checkpoint_folder_path) logger.info("Loading weights from {}".format(openai_checkpoint_folder_path)) - with open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8') as names_handle: + with open(openai_checkpoint_folder_path + "/parameters_names.json", "r", encoding="utf-8") as names_handle: names = json.load(names_handle) - with open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8') as shapes_handle: + with open(openai_checkpoint_folder_path + "/params_shapes.json", "r", encoding="utf-8") as shapes_handle: shapes = json.load(shapes_handle) offsets = np.cumsum([np.prod(shape) for shape in shapes]) - init_params = 
[np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)] + init_params = [np.load(openai_checkpoint_folder_path + "/params_{}.npy".format(n)) for n in range(10)] init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1] init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)] @@ -79,23 +81,23 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): init_params.pop(0) init_params.pop(0) - for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]): + for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]): name = name[6:] # skip "model/" assert name[-2:] == ":0" name = name[:-2] - name = name.split('/') + name = name.split("/") pointer = model for m_name in name: - if re.fullmatch(r'[A-Za-z]+\d+', m_name): - l = re.split(r'(\d+)', m_name) + if re.fullmatch(r"[A-Za-z]+\d+", m_name): + l = re.split(r"(\d+)", m_name) else: l = [m_name] - if l[0] == 'g': - pointer = getattr(pointer, 'weight') - elif l[0] == 'b': - pointer = getattr(pointer, 'bias') - elif l[0] == 'w': - pointer = getattr(pointer, 'weight') + if l[0] == "g": + pointer = getattr(pointer, "weight") + elif l[0] == "b": + pointer = getattr(pointer, "bias") + elif l[0] == "w": + pointer = getattr(pointer, "weight") else: pointer = getattr(pointer, l[0]) if len(l) >= 2: @@ -156,7 +158,7 @@ class Attention(nn.Module): mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() - index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)]) + index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) # Prune conv1d layers self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) @@ -172,7 +174,7 @@ class Attention(nn.Module): # w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights # XD: self.b may be larger than w, so we need to crop it b = self.bias[:, :, : w.size(-2), : w.size(-1)] - w = w * b + - 1e4 * (1 - b) + w = w * b + -1e4 * (1 - b) if attention_mask is not None: # Apply the attention mask @@ -261,6 +263,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = OpenAIGPTConfig pretrained_model_archive_map = OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_openai_gpt @@ -330,8 +333,12 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs: than the model's internal embedding lookup matrix. 
""" -@add_start_docstrings("The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.", - OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.", + OPENAI_GPT_START_DOCSTRING, + OPENAI_GPT_INPUTS_DOCSTRING, +) class OpenAIGPTModel(OpenAIGPTPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -354,6 +361,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(OpenAIGPTModel, self).__init__(config) self.output_attentions = config.output_attentions @@ -379,7 +387,15 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -410,7 +426,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility attention_mask = (1.0 - attention_mask) * -10000.0 # Prepare head mask if needed @@ -422,8 +438,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.n_layer @@ -463,8 +483,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): return outputs # last hidden state, (all hidden states), (all attentions) -@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top -(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +@add_start_docstrings( + """OpenAI GPT Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). 
""", + OPENAI_GPT_START_DOCSTRING, + OPENAI_GPT_INPUTS_DOCSTRING, +) class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -496,6 +520,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(OpenAIGPTLMHeadModel, self).__init__(config) self.transformer = OpenAIGPTModel(config) @@ -506,14 +531,24 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): def get_output_embeddings(self): return self.lm_head - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - labels=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) @@ -524,18 +559,21 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), lm_logits, (all hidden states), (all attentions) -@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification +@add_start_docstrings( + """OpenAI GPT Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). 
-""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +""", + OPENAI_GPT_START_DOCSTRING, + OPENAI_GPT_INPUTS_DOCSTRING, +) class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): r""" **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``: @@ -587,6 +625,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): lm_prediction_scores, mc_prediction_scores = outputs[:2] """ + def __init__(self, config): super(OpenAIGPTDoubleHeadsModel, self).__init__(config) @@ -600,14 +639,26 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): def get_output_embeddings(self): return self.lm_head - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - mc_token_ids=None, lm_labels=None, mc_labels=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + mc_token_ids=None, + lm_labels=None, + mc_labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) @@ -616,15 +667,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): outputs = (lm_logits, mc_logits) + transformer_outputs[1:] if mc_labels is not None: loss_fct = CrossEntropyLoss() - loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), - mc_labels.view(-1)) + loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) outputs = (loss,) + outputs if lm_labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (lm loss), (mc loss), lm logits, mc logits, (all hidden_states), (attentions) diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index 4faab46f7..730058ea9 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -15,8 +15,7 @@ # limitations under the License. """PyTorch RoBERTa model. 
""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging @@ -31,24 +30,27 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin", - 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin", - 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin", - 'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin", - 'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin", - 'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin", + "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin", + "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin", + "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin", + "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin", + "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin", + "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin", } + class RobertaEmbeddings(BertEmbeddings): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ + def __init__(self, config): super(RobertaEmbeddings, self).__init__(config) self.padding_idx = 1 self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size, - padding_idx=self.padding_idx) + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): if position_ids is None: @@ -58,10 +60,9 @@ class RobertaEmbeddings(BertEmbeddings): else: position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) - return super(RobertaEmbeddings, self).forward(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds) + return super(RobertaEmbeddings, self).forward( + input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds + ) def create_position_ids_from_input_ids(self, x): """ Replace non-padding symbols with their position numbers. 
Position numbers begin at @@ -85,8 +86,9 @@ class RobertaEmbeddings(BertEmbeddings): input_shape = inputs_embeds.size()[:-1] sequence_length = input_shape[1] - position_ids = torch.arange(self.padding_idx+1, sequence_length+self.padding_idx+1, dtype=torch.long, - device=inputs_embeds.device) + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) return position_ids.unsqueeze(0).expand(input_shape) @@ -162,8 +164,12 @@ ROBERTA_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class RobertaModel(BertModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -209,8 +215,10 @@ class RobertaModel(BertModel): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value -@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + +@add_start_docstrings( + """RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING +) class RobertaForMaskedLM(BertPreTrainedModel): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -256,14 +264,24 @@ class RobertaForMaskedLM(BertPreTrainedModel): def get_output_embeddings(self): return self.lm_head.decoder - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None): - outputs = self.roberta(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + masked_lm_labels=None, + ): + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) @@ -299,9 +317,12 @@ class RobertaLMHead(nn.Module): return x -@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer +@add_start_docstrings( + """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class RobertaForSequenceClassification(BertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -343,15 +364,25 @@ class RobertaForSequenceClassification(BertPreTrainedModel): self.roberta = RobertaModel(config) self.classifier = RobertaClassificationHead(config) - - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - labels=None): - outputs = self.roberta(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] logits = self.classifier(sequence_output) @@ -369,9 +400,12 @@ class RobertaForSequenceClassification(BertPreTrainedModel): return outputs # (loss), logits, (hidden_states), (attentions) -@add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of +@add_start_docstrings( + """Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class RobertaForMultipleChoice(BertPreTrainedModel): r""" Inputs: @@ -455,16 +489,29 @@ class RobertaForMultipleChoice(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None, - position_ids=None, head_mask=None, inputs_embeds=None): + def forward( + self, + input_ids=None, + token_type_ids=None, + attention_mask=None, + labels=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + ): num_choices = input_ids.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - outputs = self.roberta(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, - attention_mask=flat_attention_mask, head_mask=head_mask) + outputs = self.roberta( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + head_mask=head_mask, + ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) @@ -481,9 +528,12 @@ class RobertaForMultipleChoice(BertPreTrainedModel): return outputs # (loss), reshaped_logits, (hidden_states), (attentions) -@add_start_docstrings("""Roberta Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class RobertaForTokenClassification(BertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -527,15 +577,25 @@ class RobertaForTokenClassification(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, - position_ids=None, head_mask=None, inputs_embeds=None, labels=None): - - outputs = self.roberta(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] @@ -577,9 +637,12 @@ class RobertaClassificationHead(nn.Module): return x -@add_start_docstrings("""Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class RobertaForQuestionAnswering(BertPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -626,14 +689,24 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, - start_positions=None, end_positions=None): - - outputs = self.roberta(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask) + def forward( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + start_positions=None, + end_positions=None, + ): + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + ) sequence_output = outputs[0] @@ -660,4 +733,4 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs - return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) \ No newline at end of file + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index 9baf69d02..2ee8cd011 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -41,11 +41,11 @@ logger = logging.getLogger(__name__) # for the pretrained weights provided with the models #################################################### T5_PRETRAINED_MODEL_ARCHIVE_MAP = { - 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin", - 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin", - 't5-large': 
"https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin", - 't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin", - 't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin", + "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin", + "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin", + "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin", + "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin", + "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin", } #################################################### @@ -60,8 +60,10 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): import numpy as np import tensorflow as tf except ImportError: - logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) @@ -76,26 +78,26 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): tf_weights[name] = array for txt_name in names: - name = txt_name.split('/') + name = txt_name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any(n in ["adam_v", "adam_m", "global_step"] for n in name): logger.info("Skipping {}".format("/".join(name))) tf_weights.pop(txt_name, None) continue - if '_slot_' in name[-1]: + if "_slot_" in name[-1]: logger.info("Skipping {}".format("/".join(name))) tf_weights.pop(txt_name, None) continue pointer = model array = tf_weights[txt_name] for m_name in name: - if re.fullmatch(r'[A-Za-z]+_\d+', m_name): - l = re.split(r'_(\d+)', m_name) + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + l = re.split(r"_(\d+)", m_name) else: l = [m_name] - if l[0] in ['kernel', 'scale', 'embedding']: - pointer = getattr(pointer, 'weight') + if l[0] in ["kernel", "scale", "embedding"]: + pointer = getattr(pointer, "weight") # elif l[0] == 'scale': # pointer = getattr(pointer, 'weight') # elif l[0] == 'output_bias' or l[0] == 'beta': @@ -111,9 +113,9 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): if len(l) >= 2: num = int(l[1]) pointer = pointer[num] - if l[0] not in ['kernel', 'scale', 'embedding']: - pointer = getattr(pointer, 'weight') - if l[0] != 'embedding': + if l[0] not in ["kernel", "scale", "embedding"]: + pointer = getattr(pointer, "weight") + if l[0] != "embedding": logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name)) array = np.transpose(array) try: @@ -125,7 +127,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): pointer.data = torch.from_numpy(array.astype(np.float32)) tf_weights.pop(txt_name, None) - logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) + logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys()))) # logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) return model @@ -136,6 +138,7 @@ def 
load_tf_weights_in_t5(model, config, tf_checkpoint_path): # - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module) #################################################### + class T5LayerNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ Construct a layernorm module in the T5 style @@ -228,10 +231,7 @@ class T5Attention(nn.Module): self.pruned_heads = self.pruned_heads.union(heads) @staticmethod - def _relative_position_bucket(relative_position, - bidirectional=True, - num_buckets=32, - max_distance=128): + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): """ Adapted from Mesh Tensorflow: https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 @@ -267,12 +267,12 @@ class T5Attention(nn.Module): # half of the buckets are for exact increments in positions max_exact = num_buckets // 2 - is_small = (n < max_exact) + is_small = n < max_exact # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance val_if_large = max_exact + ( - torch.log(n.float() / max_exact) - / math.log(max_distance / max_exact) * (num_buckets - max_exact)).to(torch.long) + torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) + ).to(torch.long) val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1)) ret += torch.where(is_small, n, val_if_large) @@ -283,11 +283,13 @@ class T5Attention(nn.Module): context_position = torch.arange(qlen, dtype=torch.long)[:, None] memory_position = torch.arange(klen, dtype=torch.long)[None, :] relative_position = memory_position - context_position # shape (qlen, klen) - rp_bucket = self._relative_position_bucket(relative_position, # shape (qlen, klen) - bidirectional=not self.is_decoder, - num_buckets=self.relative_attention_num_buckets) + rp_bucket = self._relative_position_bucket( + relative_position, # shape (qlen, klen) + bidirectional=not self.is_decoder, + num_buckets=self.relative_attention_num_buckets, + ) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) - values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) + values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) return values def forward(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None): @@ -298,7 +300,7 @@ class T5Attention(nn.Module): # Mask is (bs, klen) (non-causal) or (bs, klen, klen) bs, qlen, dim = input.size() if kv is None: - klen = qlen if cache is None else cache['slen'] + qlen + klen = qlen if cache is None else cache["slen"] + qlen else: klen = kv.size(1) @@ -310,45 +312,45 @@ class T5Attention(nn.Module): """ compute context """ return x.transpose(1, 2).contiguous().view(bs, -1, self.inner_dim) - q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) + q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: - k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head) elif cache is None or self.layer_id not in cache: k = v = kv - k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head) + v = 
shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head) if cache is not None: if self.layer_id in cache: if kv is None: k_, v_ = cache[self.layer_id] - k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) - v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) + k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) + v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) else: k, v = cache[self.layer_id] cache[self.layer_id] = (k, v) # q = q / math.sqrt(dim_per_head) # No scaling in T5 - scores = torch.einsum('bnqd,bnkd->bnqk', q, k) # (bs, n_heads, qlen, klen) + scores = torch.einsum("bnqd,bnkd->bnqk", q, k) # (bs, n_heads, qlen, klen) if position_bias is None: if not self.has_relative_attention_bias: raise ValueError("No position_bias provided and no weights to compute position_bias") position_bias = self.compute_bias(qlen, klen) if mask is not None: - position_bias = position_bias + mask # (bs, n_heads, qlen, klen) + position_bias = position_bias + mask # (bs, n_heads, qlen, klen) scores += position_bias - weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) + weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask - context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) - context = unshape(context) # (bs, qlen, dim) + context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) context = self.o(context) @@ -369,10 +371,9 @@ class T5LayerSelfAttention(nn.Module): def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None): norm_x = self.layer_norm(hidden_states) - attention_output = self.SelfAttention(norm_x, - mask=attention_mask, - position_bias=position_bias, - head_mask=head_mask) + attention_output = self.SelfAttention( + norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask + ) y = attention_output[0] layer_output = hidden_states + self.dropout(y) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -388,11 +389,9 @@ class T5LayerCrossAttention(nn.Module): def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None): norm_x = self.layer_norm(hidden_states) - attention_output = self.EncDecAttention(norm_x, - mask=attention_mask, - kv=kv, - position_bias=position_bias, - head_mask=head_mask) + attention_output = self.EncDecAttention( + norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask + ) y = attention_output[0] layer_output = hidden_states + self.dropout(y) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -411,26 +410,36 @@ class T5Block(nn.Module): else: self.layer.append(T5LayerFF(config)) - def forward(self, hidden_states, attention_mask=None, position_bias=None, - encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None, - head_mask=None): - self_attention_outputs = self.layer[0](hidden_states, - attention_mask=attention_mask, - position_bias=position_bias, - head_mask=head_mask) + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + head_mask=None, + ): + 
self_attention_outputs = self.layer[0]( + hidden_states, attention_mask=attention_mask, position_bias=position_bias, head_mask=head_mask + ) hidden_states = self_attention_outputs[0] outputs = self_attention_outputs[1:] # Keep self-attention outputs and relative position weights if not self.is_decoder: hidden_states = self.layer[1](hidden_states) else: - cross_attention_outputs = self.layer[1](hidden_states, - kv=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - head_mask=head_mask) + cross_attention_outputs = self.layer[1]( + hidden_states, + kv=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + head_mask=head_mask, + ) hidden_states = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:] # Keep cross-attention outputs and relative position weights + outputs = ( + outputs + cross_attention_outputs[1:] + ) # Keep cross-attention outputs and relative position weights hidden_states = self.layer[2](hidden_states) outputs = (hidden_states,) + outputs # add attentions if we output them @@ -441,6 +450,7 @@ class T5PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = T5Config pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_t5 @@ -450,29 +460,31 @@ class T5PreTrainedModel(PreTrainedModel): def dummy_inputs(self): input_ids = torch.tensor(DUMMY_INPUTS) input_mask = torch.tensor(DUMMY_MASK) - dummy_inputs = {'decoder_input_ids': input_ids, - 'encoder_input_ids': input_ids, - 'decoder_attention_mask': input_mask} + dummy_inputs = { + "decoder_input_ids": input_ids, + "encoder_input_ids": input_ids, + "decoder_attention_mask": input_mask, + } return dummy_inputs def _init_weights(self, module): """ Initialize the weights """ factor = self.config.initializer_factor # Used for testing weights initialization if isinstance(module, T5LayerNorm): - module.weight.data.fill_(factor*1.0) + module.weight.data.fill_(factor * 1.0) elif isinstance(module, (T5Model, T5WithLMHeadModel)): # Mesh TensorFlow embeddings initialization # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 - module.shared.weight.data.normal_(mean=0.0, std=factor*1.0) + module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) elif isinstance(module, T5DenseReluDense): # Mesh TensorFlow FF initialization # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 - module.wi.weight.data.normal_(mean=0.0, std=factor*((self.config.d_model) ** -0.5)) - if hasattr(module.wi, 'bias') and module.wi.bias is not None: + module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi, "bias") and module.wi.bias is not None: module.wi.bias.data.zero_() - module.wo.weight.data.normal_(mean=0.0, std=factor*((self.config.d_ff) ** -0.5)) - if hasattr(module.wo, 'bias') and module.wo.bias is not None: + module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, "bias") and module.wo.bias is not None: module.wo.bias.data.zero_() elif isinstance(module, T5Attention): # Mesh TensorFlow attention initialization to 
avoid scaling before softmax @@ -480,12 +492,12 @@ class T5PreTrainedModel(PreTrainedModel): d_model = self.config.d_model d_kv = self.config.d_kv n_heads = self.config.num_heads - module.q.weight.data.normal_(mean=0.0, std=factor*((d_model * d_kv) ** -0.5)) - module.k.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5)) - module.v.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5)) - module.o.weight.data.normal_(mean=0.0, std=factor*((n_heads * d_kv) ** -0.5)) + module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * d_kv) ** -0.5)) + module.k.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5)) + module.v.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5)) + module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * d_kv) ** -0.5)) if module.has_relative_attention_bias: - module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor*((d_model) ** -0.5)) + module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5)) class T5Stack(T5PreTrainedModel): @@ -495,19 +507,22 @@ class T5Stack(T5PreTrainedModel): self.output_hidden_states = config.output_hidden_states self.is_decoder = config.is_decoder - self.block = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0)) - for i in range(config.num_layers)]) + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + ) self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) self.init_weights() - def forward(self, - hidden_states, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - head_mask=None): + def forward( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + ): batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1] if attention_mask is None: @@ -521,9 +536,9 @@ class T5Stack(T5PreTrainedModel): if attention_mask.dim() == 3: extended_attention_mask = attention_mask[:, None, :, :] elif attention_mask.dim() == 2: - # Provided a padding mask of dimensions [batch_size, seq_length] - # - if the model is a decoder, apply a causal mask in addition to the padding mask - # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder: seq_ids = torch.arange(seq_length, device=hidden_states.device) causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] @@ -557,7 +572,9 @@ class T5Stack(T5PreTrainedModel): # Cf. 
https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 # encoder_extended_attention_mask = (encoder_extended_attention_mask == encoder_extended_attention_mask.transpose(-1, -2)) - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + encoder_extended_attention_mask = encoder_extended_attention_mask.to( + dtype=next(self.parameters()).dtype + ) # fp16 compatibility encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 else: encoder_extended_attention_mask = None @@ -572,8 +589,12 @@ class T5Stack(T5PreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.num_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.num_layers @@ -587,13 +608,15 @@ class T5Stack(T5PreTrainedModel): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - layer_outputs = layer_module(hidden_states, - attention_mask=extended_attention_mask, - position_bias=position_bias, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - encoder_decoder_position_bias=encoder_decoder_position_bias, - head_mask=head_mask[i]) + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + head_mask=head_mask[i], + ) # layer_outputs is a tuple with: # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) hidden_states = layer_outputs[0] @@ -672,9 +695,12 @@ T5_INPUTS_DOCSTRING = r""" ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. """ -@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states" - "without any specific head on top.", - T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.", + T5_START_DOCSTRING, + T5_INPUTS_DOCSTRING, +) class T5Model(T5PreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -697,6 +723,7 @@ class T5Model(T5PreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(T5Model, self).__init__(config) self.shared = nn.Embedding(config.vocab_size, config.d_model) @@ -729,12 +756,13 @@ class T5Model(T5PreTrainedModel): # `encoder_`), decoder-specific (prefixed by `decoder_`) and those # that apply to the model as whole. # We let the specific kwargs override the common ones in case of conflict. 
- kwargs_common = dict((k, v) for k, v in kwargs.items() - if not k.startswith("encoder_") and not k.startswith("decoder_")) + kwargs_common = dict( + (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_") + ) kwargs_encoder = kwargs_common.copy() kwargs_decoder = kwargs_common.copy() - kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_"))) - kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_"))) + kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_"))) + kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_"))) # Encode if needed (training, first prediction pass) encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) @@ -770,8 +798,7 @@ class T5Model(T5PreTrainedModel): return decoder_outputs + encoder_outputs -@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, - T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) +@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) class T5WithLMHeadModel(T5PreTrainedModel): r""" **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -802,6 +829,7 @@ class T5WithLMHeadModel(T5PreTrainedModel): loss, prediction_scores = outputs[:2] """ + def __init__(self, config): super(T5WithLMHeadModel, self).__init__(config) self.model_dim = config.d_model @@ -834,14 +862,15 @@ class T5WithLMHeadModel(T5PreTrainedModel): # that apply to the model as whole. # We let the specific kwargs override the common ones in case of conflict. - lm_labels = kwargs.pop('decoder_lm_labels', None) + lm_labels = kwargs.pop("decoder_lm_labels", None) - kwargs_common = dict((k, v) for k, v in kwargs.items() - if not k.startswith("encoder_") and not k.startswith("decoder_")) + kwargs_common = dict( + (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_") + ) kwargs_encoder = kwargs_common.copy() kwargs_decoder = kwargs_common.copy() - kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_"))) - kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_"))) + kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_"))) + kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_"))) # Encode if needed (training, first prediction pass) encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) @@ -879,8 +908,9 @@ class T5WithLMHeadModel(T5PreTrainedModel): shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss(ignore_index=-1) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1)) - decoder_outputs = (loss,) + decoder_outputs # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + decoder_outputs = ( + loss, + ) + decoder_outputs # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 return decoder_outputs + 
encoder_outputs diff --git a/transformers/modeling_tf_albert.py b/transformers/modeling_tf_albert.py index ac55a73fa..25d086398 100644 --- a/transformers/modeling_tf_albert.py +++ b/transformers/modeling_tf_albert.py @@ -31,14 +31,14 @@ import logging logger = logging.getLogger(__name__) TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tf_model.h5", - 'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tf_model.h5", - 'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tf_model.h5", - 'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tf_model.h5", - 'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5", - 'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5", - 'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5", - 'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5", + "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tf_model.h5", + "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tf_model.h5", + "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tf_model.h5", + "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tf_model.h5", + "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5", + "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5", + "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5", + "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5", } @@ -50,21 +50,22 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): super(TFAlbertEmbeddings, self).__init__(**kwargs) self.config = config - self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, - config.embedding_size, - embeddings_initializer=get_initializer( - self.config.initializer_range), - name='position_embeddings') - self.token_type_embeddings = tf.keras.layers.Embedding(config.type_vocab_size, - config.embedding_size, - embeddings_initializer=get_initializer( - self.config.initializer_range), - name='token_type_embeddings') + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + config.embedding_size, + embeddings_initializer=get_initializer(self.config.initializer_range), + name="position_embeddings", + ) + self.token_type_embeddings = tf.keras.layers.Embedding( + config.type_vocab_size, + config.embedding_size, + embeddings_initializer=get_initializer(self.config.initializer_range), + name="token_type_embeddings", + ) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file - self.LayerNorm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name='LayerNorm') + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def build(self, input_shape): @@ -75,7 +76,8 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): self.word_embeddings 
= self.add_weight( "weight", shape=[self.config.vocab_size, self.config.embedding_size], - initializer=get_initializer(self.config.initializer_range)) + initializer=get_initializer(self.config.initializer_range), + ) super(TFAlbertEmbeddings, self).build(input_shape) def call(self, inputs, mode="embedding", training=False): @@ -145,34 +147,29 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer): if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads assert config.hidden_size % config.num_attention_heads == 0 - self.attention_head_size = int( - config.hidden_size / config.num_attention_heads) + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = tf.keras.layers.Dense(self.all_head_size, - kernel_initializer=get_initializer( - config.initializer_range), - name='query') - self.key = tf.keras.layers.Dense(self.all_head_size, - kernel_initializer=get_initializer( - config.initializer_range), - name='key') - self.value = tf.keras.layers.Dense(self.all_head_size, - kernel_initializer=get_initializer( - config.initializer_range), - name='value') - - self.dropout = tf.keras.layers.Dropout( - config.attention_probs_dropout_prob) + self.query = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + + self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) def transpose_for_scores(self, x, batch_size): - x = tf.reshape( - x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) + x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) return tf.transpose(x, perm=[0, 2, 1, 3]) def call(self, inputs, training=False): @@ -212,23 +209,21 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer): context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) - context_layer = tf.reshape(context_layer, - (batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size) + context_layer = tf.reshape( + context_layer, (batch_size, -1, self.all_head_size) + ) # (batch_size, seq_len_q, all_head_size) - outputs = (context_layer, attention_probs) if self.output_attentions else ( - context_layer,) + outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) return outputs class TFAlbertSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFAlbertSelfOutput, self).__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer( - config.initializer_range), - name='dense') - self.LayerNorm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name='LayerNorm') + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), 
name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): @@ -245,12 +240,10 @@ class TFAlbertAttention(TFBertSelfAttention): super(TFAlbertAttention, self).__init__(config, **kwargs) self.hidden_size = config.hidden_size - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer( - config.initializer_range), - name='dense') - self.LayerNorm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name='LayerNorm') + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.pruned_heads = set() def prune_heads(self, heads): @@ -293,11 +286,11 @@ class TFAlbertAttention(TFBertSelfAttention): context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) - context_layer = tf.reshape(context_layer, - (batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size) + context_layer = tf.reshape( + context_layer, (batch_size, -1, self.all_head_size) + ) # (batch_size, seq_len_q, all_head_size) - self_outputs = (context_layer, attention_probs) if self.output_attentions else ( - context_layer,) + self_outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) hidden_states = self_outputs[0] @@ -313,34 +306,35 @@ class TFAlbertAttention(TFBertSelfAttention): class TFAlbertLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFAlbertLayer, self).__init__(**kwargs) - self.attention = TFAlbertAttention(config, name='attention') + self.attention = TFAlbertAttention(config, name="attention") - self.ffn = tf.keras.layers.Dense(config.intermediate_size, kernel_initializer=get_initializer( - config.initializer_range), name='ffn') + self.ffn = tf.keras.layers.Dense( + config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn" + ) if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): self.activation = ACT2FN[config.hidden_act] else: self.activation = config.hidden_act - self.ffn_output = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer( - config.initializer_range), name='ffn_output') + self.ffn_output = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output" + ) self.full_layer_layer_norm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name='full_layer_layer_norm') + epsilon=config.layer_norm_eps, name="full_layer_layer_norm" + ) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs - attention_outputs = self.attention( - [hidden_states, attention_mask, head_mask], training=training) + attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training) ffn_output = self.ffn(attention_outputs[0]) ffn_output = self.activation(ffn_output) ffn_output = self.ffn_output(ffn_output) hidden_states = self.dropout(hidden_states, training=training) - hidden_states = self.full_layer_layer_norm( - ffn_output + attention_outputs[0]) + hidden_states = 
self.full_layer_layer_norm(ffn_output + attention_outputs[0]) # add attentions if we output them outputs = (hidden_states,) + attention_outputs[1:] @@ -353,8 +347,9 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer): self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states - self.albert_layers = [TFAlbertLayer(config, name="albert_layers_._{}".format( - i)) for i in range(config.inner_group_num)] + self.albert_layers = [ + TFAlbertLayer(config, name="albert_layers_._{}".format(i)) for i in range(config.inner_group_num) + ] def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs @@ -363,8 +358,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer): layer_attentions = () for layer_index, albert_layer in enumerate(self.albert_layers): - layer_output = albert_layer( - [hidden_states, attention_mask, head_mask[layer_index]], training=training) + layer_output = albert_layer([hidden_states, attention_mask, head_mask[layer_index]], training=training) hidden_states = layer_output[0] if self.output_attentions: @@ -389,10 +383,15 @@ class TFAlbertTransformer(tf.keras.layers.Layer): self.config = config self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states - self.embedding_hidden_mapping_in = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer( - config.initializer_range), name='embedding_hidden_mapping_in') - self.albert_layer_groups = [TFAlbertLayerGroup( - config, name="albert_layer_groups_._{}".format(i)) for i in range(config.num_hidden_groups)] + self.embedding_hidden_mapping_in = tf.keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="embedding_hidden_mapping_in", + ) + self.albert_layer_groups = [ + TFAlbertLayerGroup(config, name="albert_layer_groups_._{}".format(i)) + for i in range(config.num_hidden_groups) + ] def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs @@ -405,15 +404,19 @@ class TFAlbertTransformer(tf.keras.layers.Layer): for i in range(self.config.num_hidden_layers): # Number of layers in a hidden group - layers_per_group = int( - self.config.num_hidden_layers / self.config.num_hidden_groups) + layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups) # Index of the hidden group - group_idx = int( - i / (self.config.num_hidden_layers / self.config.num_hidden_groups)) + group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups)) layer_group_output = self.albert_layer_groups[group_idx]( - [hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group]], training=training) + [ + hidden_states, + attention_mask, + head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group], + ], + training=training, + ) hidden_states = layer_group_output[0] if self.output_attentions: @@ -436,6 +439,7 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. 
""" + config_class = AlbertConfig pretrained_model_archive_map = TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "albert" @@ -446,31 +450,25 @@ class TFAlbertMLMHead(tf.keras.layers.Layer): super(TFAlbertMLMHead, self).__init__(**kwargs) self.vocab_size = config.vocab_size - self.dense = tf.keras.layers.Dense(config.embedding_size, - kernel_initializer=get_initializer( - config.initializer_range), - name='dense') + self.dense = tf.keras.layers.Dense( + config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): self.activation = ACT2FN[config.hidden_act] else: self.activation = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name='LayerNorm') + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.decoder = input_embeddings def build(self, input_shape): - self.bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='bias') - self.decoder_bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='decoder/bias') + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") + self.decoder_bias = self.add_weight( + shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias" + ) super(TFAlbertMLMHead, self).build(input_shape) def call(self, hidden_states): @@ -560,8 +558,12 @@ ALBERT_INPUTS_DOCSTRING = r""" ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. 
""" -@add_start_docstrings("The bare Albert Model transformer outputing raw hidden-states without any specific head on top.", - ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare Albert Model transformer outputing raw hidden-states without any specific head on top.", + ALBERT_START_DOCSTRING, + ALBERT_INPUTS_DOCSTRING, +) class TFAlbertModel(TFAlbertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -601,8 +603,12 @@ class TFAlbertModel(TFAlbertPreTrainedModel): self.embeddings = TFAlbertEmbeddings(config, name="embeddings") self.encoder = TFAlbertTransformer(config, name="encoder") - self.pooler = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer( - config.initializer_range), activation='tanh', name='pooler') + self.pooler = tf.keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="pooler", + ) def get_input_embeddings(self): return self.embeddings @@ -617,7 +623,16 @@ class TFAlbertModel(TFAlbertPreTrainedModel): """ raise NotImplementedError - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -627,12 +642,12 @@ class TFAlbertModel(TFAlbertPreTrainedModel): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs @@ -678,10 +693,8 @@ class TFAlbertModel(TFAlbertPreTrainedModel): head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) - embedding_output = self.embeddings( - [input_ids, position_ids, token_type_ids, inputs_embeds], training=training) - encoder_outputs = self.encoder( - [embedding_output, extended_attention_mask, head_mask], training=training) + embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) + encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output[:, 0]) @@ -692,8 +705,9 @@ class TFAlbertModel(TFAlbertPreTrainedModel): return outputs -@add_start_docstrings("""Albert Model with a `language modeling` head on top. """, - ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """Albert Model with a `language modeling` head on top. 
""", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING +) class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -723,9 +737,8 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs) - self.albert = TFAlbertModel(config, name='albert') - self.predictions = TFAlbertMLMHead( - config, self.albert.embeddings, name='predictions') + self.albert = TFAlbertModel(config, name="albert") + self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions") def get_output_embeddings(self): return self.albert.embeddings @@ -734,8 +747,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): outputs = self.albert(inputs, **kwargs) sequence_output = outputs[0] - prediction_scores = self.predictions( - sequence_output, training=kwargs.get('training', False)) + prediction_scores = self.predictions(sequence_output, training=kwargs.get("training", False)) # Add hidden states and attention if they are here outputs = (prediction_scores,) + outputs[2:] @@ -743,9 +755,12 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): return outputs # prediction_scores, (hidden_states), (attentions) -@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) + ALBERT_START_DOCSTRING, + ALBERT_INPUTS_DOCSTRING, +) class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -771,24 +786,25 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.albert = TFAlbertModel(config, name='albert') + self.albert = TFAlbertModel(config, name="albert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): outputs = self.albert(inputs, **kwargs) pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False)) + pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - return outputs # logits, (hidden_states), (attentions) \ No newline at end of file + return outputs # logits, (hidden_states), (attentions) diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py index 031ffea17..24a7338d4 100644 --- a/transformers/modeling_tf_auto.py +++ b/transformers/modeling_tf_auto.py @@ -18,24 +18,70 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging -from .configuration_auto import (BertConfig, 
CTRLConfig, DistilBertConfig, - GPT2Config, OpenAIGPTConfig, RobertaConfig, - TransfoXLConfig, XLMConfig, XLNetConfig) - -from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, \ - TFBertForQuestionAnswering, TFBertForTokenClassification, TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .configuration_auto import ( + BertConfig, + CTRLConfig, + DistilBertConfig, + GPT2Config, + OpenAIGPTConfig, + RobertaConfig, + TransfoXLConfig, + XLMConfig, + XLNetConfig, +) + +from .modeling_tf_bert import ( + TFBertModel, + TFBertForMaskedLM, + TFBertForSequenceClassification, + TFBertForQuestionAnswering, + TFBertForTokenClassification, + TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, +) from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel, TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, \ - TFXLNetForQuestionAnsweringSimple, TFXLNetForTokenClassification, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, \ - TFXLMForQuestionAnsweringSimple, TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, \ - TFRobertaForTokenClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_transfo_xl import ( + TFTransfoXLModel, + TFTransfoXLLMHeadModel, + TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_tf_xlnet import ( + TFXLNetModel, + TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForQuestionAnsweringSimple, + TFXLNetForTokenClassification, + TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_tf_xlm import ( + TFXLMModel, + TFXLMWithLMHeadModel, + TFXLMForSequenceClassification, + TFXLMForQuestionAnsweringSimple, + TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_tf_roberta import ( + TFRobertaModel, + TFRobertaForMaskedLM, + TFRobertaForSequenceClassification, + TFRobertaForTokenClassification, + TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_tf_distilbert import ( + TFDistilBertModel, + TFDistilBertForQuestionAnswering, + TFDistilBertForMaskedLM, + TFDistilBertForSequenceClassification, + TFDistilBertForTokenClassification, + TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, +) from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_albert import TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_albert import ( + TFAlbertModel, + TFAlbertForMaskedLM, + TFAlbertForSequenceClassification, + TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, +) from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP from .file_utils import add_start_docstrings @@ -43,7 +89,8 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) -TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value) 
+TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict( + (key, value) for pretrained_map in [ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, @@ -56,8 +103,9 @@ TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value) TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP, - ] - for key, value, in pretrained_map.items()) + ] + for key, value, in pretrained_map.items() +) class TFAutoModel(object): @@ -85,10 +133,13 @@ class TFAutoModel(object): This class cannot be instantiated using `__init__()` (throws an error). """ + def __init__(self): - raise EnvironmentError("TFAutoModel is designed to be instantiated " + raise EnvironmentError( + "TFAutoModel is designed to be instantiated " "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` or " - "`TFAutoModel.from_config(config)` methods.") + "`TFAutoModel.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -209,32 +260,34 @@ class TFAutoModel(object): model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ - if 't5' in pretrained_model_name_or_path: + if "t5" in pretrained_model_name_or_path: return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: + elif "distilbert" in pretrained_model_name_or_path: return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'albert' in pretrained_model_name_or_path: + elif "albert" in pretrained_model_name_or_path: return TFAlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: + elif "roberta" in pretrained_model_name_or_path: return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return TFBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'openai-gpt' in pretrained_model_name_or_path: + elif "openai-gpt" in pretrained_model_name_or_path: return TFOpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'gpt2' in pretrained_model_name_or_path: + elif "gpt2" in pretrained_model_name_or_path: return TFGPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'transfo-xl' in pretrained_model_name_or_path: + elif "transfo-xl" in pretrained_model_name_or_path: return TFTransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return TFXLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return TFXLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'ctrl' in pretrained_model_name_or_path: + elif "ctrl" in pretrained_model_name_or_path: return TFCTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. 
Should contains one of " + "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path) + ) class TFAutoModelWithLMHead(object): @@ -262,10 +315,13 @@ class TFAutoModelWithLMHead(object): This class cannot be instantiated using `__init__()` (throws an error). """ + def __init__(self): - raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated " + raise EnvironmentError( + "TFAutoModelWithLMHead is designed to be instantiated " "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or " - "`TFAutoModelWithLMHead.from_config(config)` methods.") + "`TFAutoModelWithLMHead.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -390,32 +446,34 @@ class TFAutoModelWithLMHead(object): model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ - if 't5' in pretrained_model_name_or_path: + if "t5" in pretrained_model_name_or_path: return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: + elif "distilbert" in pretrained_model_name_or_path: return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'albert' in pretrained_model_name_or_path: + elif "albert" in pretrained_model_name_or_path: return TFAlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: + elif "roberta" in pretrained_model_name_or_path: return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return TFBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'openai-gpt' in pretrained_model_name_or_path: + elif "openai-gpt" in pretrained_model_name_or_path: return TFOpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'gpt2' in pretrained_model_name_or_path: + elif "gpt2" in pretrained_model_name_or_path: return TFGPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'transfo-xl' in pretrained_model_name_or_path: + elif "transfo-xl" in pretrained_model_name_or_path: return TFTransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return TFXLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return TFXLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'ctrl' in pretrained_model_name_or_path: + elif "ctrl" in pretrained_model_name_or_path: return TFCTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. 
Should contains one of " + "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path) + ) class TFAutoModelForSequenceClassification(object): @@ -438,10 +496,13 @@ class TFAutoModelForSequenceClassification(object): This class cannot be instantiated using `__init__()` (throws an error). """ + def __init__(self): - raise EnvironmentError("TFAutoModelForSequenceClassification is designed to be instantiated " + raise EnvironmentError( + "TFAutoModelForSequenceClassification is designed to be instantiated " "using the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or " - "`TFAutoModelForSequenceClassification.from_config(config)` methods.") + "`TFAutoModelForSequenceClassification.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -552,21 +613,33 @@ class TFAutoModelForSequenceClassification(object): model = TFAutoModelForSequenceClassification.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ - if 'distilbert' in pretrained_model_name_or_path: - return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'albert' in pretrained_model_name_or_path: - return TFAlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: - return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: - return TFBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: - return TFXLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + if "distilbert" in pretrained_model_name_or_path: + return TFDistilBertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "albert" in pretrained_model_name_or_path: + return TFAlbertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "roberta" in pretrained_model_name_or_path: + return TFRobertaForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "bert" in pretrained_model_name_or_path: + return TFBertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "xlnet" in pretrained_model_name_or_path: + return TFXLNetForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "xlm" in pretrained_model_name_or_path: return TFXLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'distilbert', 'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'distilbert', 'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path) + ) class TFAutoModelForQuestionAnswering(object): @@ -588,10 +661,13 @@ class TFAutoModelForQuestionAnswering(object): This class cannot be instantiated using `__init__()` (throws an error). 
""" + def __init__(self): - raise EnvironmentError("TFAutoModelForQuestionAnswering is designed to be instantiated " + raise EnvironmentError( + "TFAutoModelForQuestionAnswering is designed to be instantiated " "using the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or " - "`TFAutoModelForQuestionAnswering.from_config(config)` methods.") + "`TFAutoModelForQuestionAnswering.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -698,24 +774,34 @@ class TFAutoModelForQuestionAnswering(object): model = TFAutoModelForQuestionAnswering.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ - if 'distilbert' in pretrained_model_name_or_path: - return TFDistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + if "distilbert" in pretrained_model_name_or_path: + return TFDistilBertForQuestionAnswering.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "bert" in pretrained_model_name_or_path: return TFBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: - return TFXLNetForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: - return TFXLMForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif "xlnet" in pretrained_model_name_or_path: + return TFXLNetForQuestionAnsweringSimple.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "xlm" in pretrained_model_name_or_path: + return TFXLMForQuestionAnsweringSimple.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'distilbert', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'distilbert', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path) + ) class TFAutoModelForTokenClassification: def __init__(self): - raise EnvironmentError("TFAutoModelForTokenClassification is designed to be instantiated " - "using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " - "`AutoModelForTokenClassification.from_config(config)` methods.") + raise EnvironmentError( + "TFAutoModelForTokenClassification is designed to be instantiated " + "using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " + "`AutoModelForTokenClassification.from_config(config)` methods." 
+ ) @classmethod def from_config(cls, config): @@ -815,14 +901,20 @@ class TFAutoModelForTokenClassification: model = TFAutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 'bert' in pretrained_model_name_or_path: + if "bert" in pretrained_model_name_or_path: return TFBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return TFXLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: - return TFDistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: - return TFRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'xlnet', 'distilbert', 'roberta'".format(pretrained_model_name_or_path)) + elif "distilbert" in pretrained_model_name_or_path: + return TFDistilBertForTokenClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "roberta" in pretrained_model_name_or_path: + return TFRobertaForTokenClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'bert', 'xlnet', 'distilbert', 'roberta'".format(pretrained_model_name_or_path) + ) diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py index 9caad53a5..bcb83d5df 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -35,25 +35,25 @@ logger = logging.getLogger(__name__) TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5", - 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5", - 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5", - 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5", - 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5", - 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5", - 'bert-base-cased-finetuned-mrpc': 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5", - 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5", - 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5", - 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5", - 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5", - 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5", - 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5", + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5", + "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5", + "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5", + "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5", + "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5", + "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5", + "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5", + "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5", } @@ -67,6 +67,7 @@ def gelu(x): cdf = 0.5 * (1.0 + 
tf.math.erf(x / tf.math.sqrt(2.0))) return x * cdf + def gelu_new(x): """Gaussian Error Linear Unit. This is a smoother version of the RELU. @@ -76,41 +77,48 @@ def gelu_new(x): Returns: `x` with the GELU activation applied. """ - cdf = 0.5 * (1.0 + tf.tanh( - (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf + def swish(x): return x * tf.sigmoid(x) -ACT2FN = {"gelu": tf.keras.layers.Activation(gelu), - "relu": tf.keras.activations.relu, - "swish": tf.keras.layers.Activation(swish), - "gelu_new": tf.keras.layers.Activation(gelu_new)} +ACT2FN = { + "gelu": tf.keras.layers.Activation(gelu), + "relu": tf.keras.activations.relu, + "swish": tf.keras.layers.Activation(swish), + "gelu_new": tf.keras.layers.Activation(gelu_new), +} class TFBertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings. """ + def __init__(self, config, **kwargs): super(TFBertEmbeddings, self).__init__(**kwargs) self.vocab_size = config.vocab_size self.hidden_size = config.hidden_size self.initializer_range = config.initializer_range - self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), - name='position_embeddings') - self.token_type_embeddings = tf.keras.layers.Embedding(config.type_vocab_size, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), - name='token_type_embeddings') + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + config.hidden_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="position_embeddings", + ) + self.token_type_embeddings = tf.keras.layers.Embedding( + config.type_vocab_size, + config.hidden_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="token_type_embeddings", + ) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm') + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def build(self, input_shape): @@ -121,7 +129,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer): self.word_embeddings = self.add_weight( "weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range)) + initializer=get_initializer(self.initializer_range), + ) super(TFBertEmbeddings, self).build(input_shape) def call(self, inputs, mode="embedding", training=False): @@ -193,7 +202,8 @@ class TFBertSelfAttention(tf.keras.layers.Layer): if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads @@ -201,15 +211,15 @@ class TFBertSelfAttention(tf.keras.layers.Layer): self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = 
tf.keras.layers.Dense(self.all_head_size, - kernel_initializer=get_initializer(config.initializer_range), - name='query') - self.key = tf.keras.layers.Dense(self.all_head_size, - kernel_initializer=get_initializer(config.initializer_range), - name='key') - self.value = tf.keras.layers.Dense(self.all_head_size, - kernel_initializer=get_initializer(config.initializer_range), - name='value') + self.query = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) @@ -230,8 +240,10 @@ class TFBertSelfAttention(tf.keras.layers.Layer): value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # (batch size, num_heads, seq_len_q, seq_len_k) - dk = tf.cast(shape_list(key_layer)[-1], tf.float32) # scale attention_scores + attention_scores = tf.matmul( + query_layer, key_layer, transpose_b=True + ) # (batch size, num_heads, seq_len_q, seq_len_k) + dk = tf.cast(shape_list(key_layer)[-1], tf.float32) # scale attention_scores attention_scores = attention_scores / tf.math.sqrt(dk) if attention_mask is not None: @@ -252,8 +264,9 @@ class TFBertSelfAttention(tf.keras.layers.Layer): context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) - context_layer = tf.reshape(context_layer, - (batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size) + context_layer = tf.reshape( + context_layer, (batch_size, -1, self.all_head_size) + ) # (batch_size, seq_len_q, all_head_size) outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) return outputs @@ -262,10 +275,10 @@ class TFBertSelfAttention(tf.keras.layers.Layer): class TFBertSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertSelfOutput, self).__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer(config.initializer_range), - name='dense') - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm') + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): @@ -280,8 +293,8 @@ class TFBertSelfOutput(tf.keras.layers.Layer): class TFBertAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertAttention, self).__init__(**kwargs) - self.self_attention = TFBertSelfAttention(config, name='self') - self.dense_output = TFBertSelfOutput(config, name='output') + self.self_attention = TFBertSelfAttention(config, name="self") + self.dense_output = TFBertSelfOutput(config, name="output") def prune_heads(self, heads): raise NotImplementedError @@ -298,9 +311,9 @@ class TFBertAttention(tf.keras.layers.Layer): class 
TFBertIntermediate(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertIntermediate, self).__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.intermediate_size, - kernel_initializer=get_initializer(config.initializer_range), - name='dense') + self.dense = tf.keras.layers.Dense( + config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): self.intermediate_act_fn = ACT2FN[config.hidden_act] else: @@ -315,10 +328,10 @@ class TFBertIntermediate(tf.keras.layers.Layer): class TFBertOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertOutput, self).__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer(config.initializer_range), - name='dense') - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm') + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): @@ -333,9 +346,9 @@ class TFBertOutput(tf.keras.layers.Layer): class TFBertLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertLayer, self).__init__(**kwargs) - self.attention = TFBertAttention(config, name='attention') - self.intermediate = TFBertIntermediate(config, name='intermediate') - self.bert_output = TFBertOutput(config, name='output') + self.attention = TFBertAttention(config, name="attention") + self.intermediate = TFBertIntermediate(config, name="intermediate") + self.bert_output = TFBertOutput(config, name="output") def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs @@ -353,7 +366,7 @@ class TFBertEncoder(tf.keras.layers.Layer): super(TFBertEncoder, self).__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states - self.layer = [TFBertLayer(config, name='layer_._{}'.format(i)) for i in range(config.num_hidden_layers)] + self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs @@ -385,10 +398,12 @@ class TFBertEncoder(tf.keras.layers.Layer): class TFBertPooler(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertPooler, self).__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer(config.initializer_range), - activation='tanh', - name='dense') + self.dense = tf.keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) def call(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding @@ -401,14 +416,14 @@ class TFBertPooler(tf.keras.layers.Layer): class TFBertPredictionHeadTransform(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertPredictionHeadTransform, self).__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer(config.initializer_range), - name='dense') + self.dense = 
tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): self.transform_act_fn = ACT2FN[config.hidden_act] else: self.transform_act_fn = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm') + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") def call(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -421,17 +436,14 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super(TFBertLMPredictionHead, self).__init__(**kwargs) self.vocab_size = config.vocab_size - self.transform = TFBertPredictionHeadTransform(config, name='transform') + self.transform = TFBertPredictionHeadTransform(config, name="transform") # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.input_embeddings = input_embeddings def build(self, input_shape): - self.bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='bias') + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super(TFBertLMPredictionHead, self).build(input_shape) def call(self, hidden_states): @@ -444,7 +456,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): class TFBertMLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super(TFBertMLMHead, self).__init__(**kwargs) - self.predictions = TFBertLMPredictionHead(config, input_embeddings, name='predictions') + self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions") def call(self, sequence_output): prediction_scores = self.predictions(sequence_output) @@ -454,9 +466,9 @@ class TFBertMLMHead(tf.keras.layers.Layer): class TFBertNSPHead(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertNSPHead, self).__init__(**kwargs) - self.seq_relationship = tf.keras.layers.Dense(2, - kernel_initializer=get_initializer(config.initializer_range), - name='seq_relationship') + self.seq_relationship = tf.keras.layers.Dense( + 2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship" + ) def call(self, pooled_output): seq_relationship_score = self.seq_relationship(pooled_output) @@ -468,9 +480,9 @@ class TFBertMainLayer(tf.keras.layers.Layer): super(TFBertMainLayer, self).__init__(**kwargs) self.num_hidden_layers = config.num_hidden_layers - self.embeddings = TFBertEmbeddings(config, name='embeddings') - self.encoder = TFBertEncoder(config, name='encoder') - self.pooler = TFBertPooler(config, name='pooler') + self.embeddings = TFBertEmbeddings(config, name="embeddings") + self.encoder = TFBertEncoder(config, name="encoder") + self.pooler = TFBertPooler(config, name="pooler") def get_input_embeddings(self): return self.embeddings @@ -485,7 +497,16 @@ class TFBertMainLayer(tf.keras.layers.Layer): """ raise NotImplementedError - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] 
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -495,12 +516,12 @@ class TFBertMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs @@ -552,7 +573,9 @@ class TFBertMainLayer(tf.keras.layers.Layer): sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + outputs = (sequence_output, pooled_output,) + encoder_outputs[ + 1: + ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) @@ -560,6 +583,7 @@ class TFBertPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = BertConfig pretrained_model_archive_map = TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "bert" @@ -648,8 +672,12 @@ BERT_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class TFBertModel(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -682,18 +710,22 @@ class TFBertModel(TFBertPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFBertModel, self).__init__(config, *inputs, **kwargs) - self.bert = TFBertMainLayer(config, name='bert') + self.bert = TFBertMainLayer(config, name="bert") def call(self, inputs, **kwargs): outputs = self.bert(inputs, **kwargs) return outputs -@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training: +@add_start_docstrings( + """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. 
""", - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class TFBertForPreTraining(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -721,12 +753,13 @@ class TFBertForPreTraining(TFBertPreTrainedModel): prediction_scores, seq_relationship_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFBertForPreTraining, self).__init__(config, *inputs, **kwargs) - self.bert = TFBertMainLayer(config, name='bert') - self.nsp = TFBertNSPHead(config, name='nsp___cls') - self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls') + self.bert = TFBertMainLayer(config, name="bert") + self.nsp = TFBertNSPHead(config, name="nsp___cls") + self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") def get_output_embeddings(self): return self.bert.embeddings @@ -735,16 +768,19 @@ class TFBertForPreTraining(TFBertPreTrainedModel): outputs = self.bert(inputs, **kwargs) sequence_output, pooled_output = outputs[:2] - prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False)) + prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) seq_relationship_score = self.nsp(pooled_output) - outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + outputs = (prediction_scores, seq_relationship_score,) + outputs[ + 2: + ] # add hidden states and attention if they are here return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING +) class TFBertForMaskedLM(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -770,11 +806,12 @@ class TFBertForMaskedLM(TFBertPreTrainedModel): prediction_scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs) - self.bert = TFBertMainLayer(config, name='bert') - self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls') + self.bert = TFBertMainLayer(config, name="bert") + self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") def get_output_embeddings(self): return self.bert.embeddings @@ -783,15 +820,18 @@ class TFBertForMaskedLM(TFBertPreTrainedModel): outputs = self.bert(inputs, **kwargs) sequence_output = outputs[0] - prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False)) + prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here return outputs # prediction_scores, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """, - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. 
""", + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class TFBertForNextSentencePrediction(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -817,11 +857,12 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): seq_relationship_scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFBertForNextSentencePrediction, self).__init__(config, *inputs, **kwargs) - self.bert = TFBertMainLayer(config, name='bert') - self.nsp = TFBertNSPHead(config, name='nsp___cls') + self.bert = TFBertMainLayer(config, name="bert") + self.nsp = TFBertNSPHead(config, name="nsp___cls") def call(self, inputs, **kwargs): outputs = self.bert(inputs, **kwargs) @@ -834,9 +875,12 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): return outputs # seq_relationship_score, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class TFBertForSequenceClassification(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -862,22 +906,23 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.bert = TFBertMainLayer(config, name='bert') + self.bert = TFBertMainLayer(config, name="bert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): outputs = self.bert(inputs, **kwargs) pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False)) + pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here @@ -885,9 +930,12 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel): return outputs # logits, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of +@add_start_docstrings( + """Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class TFBertForMultipleChoice(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -915,16 +963,26 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): classification_scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFBertForMultipleChoice, self).__init__(config, *inputs, **kwargs) - self.bert = TFBertMainLayer(config, name='bert') + self.bert = TFBertMainLayer(config, name="bert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(1, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') - - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): + self.classifier = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -934,12 +992,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs @@ -956,7 +1014,14 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None - flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds] + flat_inputs = [ + flat_input_ids, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + inputs_embeds, + ] outputs = self.bert(flat_inputs, training=training) @@ -971,9 +1036,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): return outputs # reshaped_logits, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class TFBertForTokenClassification(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -999,22 +1067,23 @@ class TFBertForTokenClassification(TFBertPreTrainedModel): scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFBertForTokenClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.bert = TFBertMainLayer(config, name='bert') + self.bert = TFBertMainLayer(config, name="bert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): outputs = self.bert(inputs, **kwargs) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False)) + sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here @@ -1022,9 +1091,12 @@ class TFBertForTokenClassification(TFBertPreTrainedModel): return outputs # scores, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class TFBertForQuestionAnswering(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -1052,14 +1124,15 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel): start_scores, end_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.bert = TFBertMainLayer(config, name='bert') - self.qa_outputs = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='qa_outputs') + self.bert = TFBertMainLayer(config, name="bert") + self.qa_outputs = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) def call(self, inputs, **kwargs): outputs = self.bert(inputs, **kwargs) diff --git a/transformers/modeling_tf_ctrl.py b/transformers/modeling_tf_ctrl.py index 0f9b34924..3aba94a50 100644 --- a/transformers/modeling_tf_ctrl.py +++ b/transformers/modeling_tf_ctrl.py @@ -32,15 +32,15 @@ logger = logging.getLogger(__name__) TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.huggingface.co/bert/ctrl-tf_model.h5"} + def angle_defn(pos, i, d_model_size): - angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model_size)) + angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model_size)) return pos * angle_rates + def positional_encoding(position, d_model_size): # create the sinusoidal pattern for the positional encoding - angle_rads = angle_defn(np.arange(position)[:, np.newaxis], - np.arange(d_model_size)[np.newaxis, :], - d_model_size) + angle_rads = angle_defn(np.arange(position)[:, np.newaxis], np.arange(d_model_size)[np.newaxis, :], d_model_size) sines = np.sin(angle_rads[:, 0::2]) cosines = np.cos(angle_rads[:, 1::2]) @@ -49,27 +49,28 @@ def positional_encoding(position, d_model_size): pos_encoding = tf.cast(np.concatenate([sines, cosines], axis=-1), dtype=tf.float32) return pos_encoding + def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None): # calculate attention matmul_qk = tf.matmul(q, k, transpose_b=True) - + dk = tf.cast(shape_list(k)[-1], tf.float32) scaled_attention_logits = matmul_qk / tf.math.sqrt(dk) if mask is not None: - scaled_attention_logits += (mask * -1e4) + scaled_attention_logits += mask * -1e4 if attention_mask is not None: # Apply the attention mask scaled_attention_logits = scaled_attention_logits + attention_mask - attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) + attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) # Mask heads if we want to if head_mask is not None: attention_weights = attention_weights * head_mask - output = tf.matmul(attention_weights, v) + output = tf.matmul(attention_weights, v) return output, attention_weights @@ -83,11 +84,11 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): self.depth = int(d_model_size / self.num_heads) - self.Wq = tf.keras.layers.Dense(d_model_size, name='Wq') - self.Wk = tf.keras.layers.Dense(d_model_size, name='Wk') - self.Wv = tf.keras.layers.Dense(d_model_size, name='Wv') + self.Wq = tf.keras.layers.Dense(d_model_size, name="Wq") + self.Wk = tf.keras.layers.Dense(d_model_size, name="Wk") + self.Wv = tf.keras.layers.Dense(d_model_size, name="Wv") - self.dense = 
tf.keras.layers.Dense(d_model_size, name='dense') + self.dense = tf.keras.layers.Dense(d_model_size, name="dense") def split_into_heads(self, x, batch_size): x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) @@ -113,7 +114,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask) scaled_attention = tf.transpose(output[0], perm=[0, 2, 1, 3]) attn = output[1] - original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size)) + original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size)) output = self.dense(original_size_attention) outputs = (output, present) @@ -122,22 +123,22 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): return outputs - def point_wise_feed_forward_network(d_model_size, dff, name=""): - return tf.keras.Sequential([ - tf.keras.layers.Dense(dff, activation='relu', name="0"), - tf.keras.layers.Dense(d_model_size, name="2") - ], name="ffn") + return tf.keras.Sequential( + [tf.keras.layers.Dense(dff, activation="relu", name="0"), tf.keras.layers.Dense(d_model_size, name="2")], + name="ffn", + ) class TFEncoderLayer(tf.keras.layers.Layer): - def __init__(self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs): + def __init__( + self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs + ): super(TFEncoderLayer, self).__init__(**kwargs) - self.multi_head_attention = TFMultiHeadAttention(d_model_size, - num_heads, - output_attentions, - name="multi_head_attention") + self.multi_head_attention = TFMultiHeadAttention( + d_model_size, num_heads, output_attentions, name="multi_head_attention" + ) self.ffn = point_wise_feed_forward_network(d_model_size, dff, name="ffn") self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1") @@ -149,8 +150,9 @@ class TFEncoderLayer(tf.keras.layers.Layer): def call(self, inputs, training=False): x, mask, layer_past, attention_mask, head_mask = inputs normed = self.layernorm1(x) - attn_outputs = self.multi_head_attention([normed, normed, normed, mask, layer_past, - attention_mask, head_mask], training=training) + attn_outputs = self.multi_head_attention( + [normed, normed, normed, mask, layer_past, attention_mask, head_mask], training=training + ) attn_output = attn_outputs[0] attn_output = self.dropout1(attn_output, training=training) out1 = x + attn_output @@ -176,20 +178,23 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size) - - self.w = TFSharedEmbeddings(config.vocab_size, - config.n_embd, - initializer_range=config.initializer_range, - name="w") + self.w = TFSharedEmbeddings( + config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="w" + ) self.dropout = tf.keras.layers.Dropout(config.embd_pdrop) - self.h = [TFEncoderLayer(config.n_embd, - config.n_head, - config.dff, - config.resid_pdrop, - config.layer_norm_epsilon, - config.output_attentions, - name='h_._{}'.format(i)) for i in range(config.n_layer)] + self.h = [ + TFEncoderLayer( + config.n_embd, + config.n_head, + config.dff, + config.resid_pdrop, + config.layer_norm_epsilon, + config.output_attentions, + name="h_._{}".format(i), + ) + for i in range(config.n_layer) + ] self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm") def 
get_input_embeddings(self): @@ -204,7 +209,17 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): """ raise NotImplementedError - def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): + def call( + self, + inputs, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past @@ -215,13 +230,13 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds assert len(inputs) <= 7, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - past = inputs.get('past', past) - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + past = inputs.get("past", past) + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 7, "Too many inputs." else: input_ids = inputs @@ -276,14 +291,14 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): if token_type_ids is not None: token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) - token_type_embeds = self.w(token_type_ids, mode='embedding') + token_type_embeds = self.w(token_type_ids, mode="embedding") token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, tf.float32)) else: token_type_embeds = 0 position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) if inputs_embeds is None: - inputs_embeds = self.w(input_ids, mode='embedding') + inputs_embeds = self.w(input_ids, mode="embedding") seq_len = input_shape[-1] mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) @@ -333,6 +348,7 @@ class TFCTRLPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = CTRLConfig pretrained_model_archive_map = TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -392,8 +408,12 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs: than the model's internal embedding lookup matrix. 
""" -@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", - CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", + CTRL_START_DOCSTRING, + CTRL_INPUTS_DOCSTRING, +) class TFCTRLModel(TFCTRLPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -423,9 +443,10 @@ class TFCTRLModel(TFCTRLPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFCTRLModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFCTRLMainLayer(config, name='transformer') + self.transformer = TFCTRLMainLayer(config, name="transformer") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) @@ -442,10 +463,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer): self.input_embeddings = input_embeddings def build(self, input_shape): - self.bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='bias') + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super(TFCTRLLMHead, self).build(input_shape) def call(self, hidden_states): @@ -454,8 +472,12 @@ class TFCTRLLMHead(tf.keras.layers.Layer): return hidden_states -@add_start_docstrings("""The CTRL Model transformer with a language modeling head on top -(linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING) +@add_start_docstrings( + """The CTRL Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). 
""", + CTRL_START_DOCSTRING, + CTRL_INPUTS_DOCSTRING, +) class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -486,9 +508,10 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFCTRLLMHeadModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFCTRLMainLayer(config, name='transformer') + self.transformer = TFCTRLMainLayer(config, name="transformer") self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head") diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py index afd88d7eb..e9e89d2e7 100644 --- a/transformers/modeling_tf_distilbert.py +++ b/transformers/modeling_tf_distilbert.py @@ -36,9 +36,9 @@ logger = logging.getLogger(__name__) TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5", - 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5", - 'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5", + "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5", + "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5", + "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5", } @@ -53,6 +53,7 @@ def gelu(x): cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) return x * cdf + def gelu_new(x): """Gaussian Error Linear Unit. This is a smoother version of the RELU. @@ -62,24 +63,25 @@ def gelu_new(x): Returns: `x` with the GELU activation applied. """ - cdf = 0.5 * (1.0 + tf.tanh( - (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf + class TFEmbeddings(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFEmbeddings, self).__init__(**kwargs) self.vocab_size = config.vocab_size self.dim = config.dim self.initializer_range = config.initializer_range - self.word_embeddings = TFSharedEmbeddings(config.vocab_size, - config.dim, - initializer_range=config.initializer_range, - name='word_embeddings') # padding_idx=0) - self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, - config.dim, - embeddings_initializer=get_initializer(config.initializer_range), - name='position_embeddings') + self.word_embeddings = TFSharedEmbeddings( + config.vocab_size, config.dim, initializer_range=config.initializer_range, name="word_embeddings" + ) # padding_idx=0) + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + config.dim, + embeddings_initializer=get_initializer(config.initializer_range), + name="position_embeddings", + ) if config.sinusoidal_pos_embds: raise NotImplementedError @@ -92,9 +94,8 @@ class TFEmbeddings(tf.keras.layers.Layer): # Create and initialize weights. The random normal initializer was chosen # arbitrarily, and works well. 
self.word_embeddings = self.add_weight( - "weight", - shape=[self.vocab_size, self.dim], - initializer=get_initializer(self.initializer_range)) + "weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range) + ) super(TFEmbeddings, self).build(input_shape) def call(self, inputs, inputs_embeds=None, mode="embedding", training=False): @@ -149,9 +150,9 @@ class TFEmbeddings(tf.keras.layers.Layer): inputs_embeds = tf.gather(self.word_embeddings, input_ids) position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) - embeddings = inputs_embeds + position_embeddings # (bs, max_seq_length, dim) - embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) - embeddings = self.dropout(embeddings, training=training) # (bs, max_seq_length, dim) + embeddings = inputs_embeds + position_embeddings # (bs, max_seq_length, dim) + embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) + embeddings = self.dropout(embeddings, training=training) # (bs, max_seq_length, dim) return embeddings def _linear(self, inputs): @@ -181,18 +182,18 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer): assert self.dim % self.n_heads == 0 - self.q_lin = tf.keras.layers.Dense(config.dim, - kernel_initializer=get_initializer(config.initializer_range), - name="q_lin") - self.k_lin = tf.keras.layers.Dense(config.dim, - kernel_initializer=get_initializer(config.initializer_range), - name="k_lin") - self.v_lin = tf.keras.layers.Dense(config.dim, - kernel_initializer=get_initializer(config.initializer_range), - name="v_lin") - self.out_lin = tf.keras.layers.Dense(config.dim, - kernel_initializer=get_initializer(config.initializer_range), - name="out_lin") + self.q_lin = tf.keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin" + ) + self.k_lin = tf.keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin" + ) + self.v_lin = tf.keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin" + ) + self.out_lin = tf.keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin" + ) self.pruned_heads = set() @@ -233,44 +234,49 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer): """ group heads """ return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) - q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) - k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) - v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) + q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) - q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) - scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, q_length, k_length) - mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) + q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) + scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, q_length, k_length) + mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, q_length, k_length) scores = scores - 1e30 * (1.0 - mask) - weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) - weights = 
self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) + weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask - context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) - context = unshape(context) # (bs, q_length, dim) - context = self.out_lin(context) # (bs, q_length, dim) + context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, q_length, dim) + context = self.out_lin(context) # (bs, q_length, dim) if self.output_attentions: return (context, weights) else: return (context,) + class TFFFN(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFFFN, self).__init__(**kwargs) self.dropout = tf.keras.layers.Dropout(config.dropout) - self.lin1 = tf.keras.layers.Dense(config.hidden_dim, - kernel_initializer=get_initializer(config.initializer_range), - name="lin1") - self.lin2 = tf.keras.layers.Dense(config.dim, - kernel_initializer=get_initializer(config.initializer_range), - name="lin2") - assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation) - self.activation = tf.keras.layers.Activation(gelu) if config.activation=='gelu' else tf.keras.activations.relu + self.lin1 = tf.keras.layers.Dense( + config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1" + ) + self.lin2 = tf.keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2" + ) + assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format( + config.activation + ) + self.activation = ( + tf.keras.layers.Activation(gelu) if config.activation == "gelu" else tf.keras.activations.relu + ) def call(self, input, training=False): x = self.lin1(input) @@ -318,14 +324,14 @@ class TFTransformerBlock(tf.keras.layers.Layer): # Self-Attention sa_output = self.attention([x, x, x, attn_mask, head_mask], training=training) if self.output_attentions: - sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) - else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples + sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) + else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples # assert type(sa_output) == tuple sa_output = sa_output[0] - sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) + sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) # Feed Forward Network - ffn_output = self.ffn(sa_output, training=training) # (bs, seq_length, dim) + ffn_output = self.ffn(sa_output, training=training) # (bs, seq_length, dim) ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) output = (ffn_output,) @@ -341,8 +347,7 @@ class TFTransformer(tf.keras.layers.Layer): self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states - self.layer = [TFTransformerBlock(config, name='layer_._{}'.format(i)) - for i in range(config.n_layers)] + self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)] def call(self, inputs, training=False): """ @@ -401,8 +406,8 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): 
super(TFDistilBertMainLayer, self).__init__(**kwargs) self.num_hidden_layers = config.num_hidden_layers - self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings - self.transformer = TFTransformer(config, name="transformer") # Encoder + self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings + self.transformer = TFTransformer(config, name="transformer") # Encoder def get_input_embeddings(self): return self.embeddings @@ -421,10 +426,10 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds assert len(inputs) <= 4, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 4, "Too many inputs." else: input_ids = inputs @@ -439,7 +444,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): raise ValueError("You have to specify either input_ids or inputs_embeds") if attention_mask is None: - attention_mask = tf.ones(input_shape) # (bs, seq_length) + attention_mask = tf.ones(input_shape) # (bs, seq_length) attention_mask = tf.cast(attention_mask, dtype=tf.float32) # Prepare head mask if needed @@ -452,10 +457,10 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): else: head_mask = [None] * self.num_hidden_layers - embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim) + embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim) tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training) - return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) + return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) ### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ### @@ -463,6 +468,7 @@ class TFDistilBertPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ + config_class = DistilBertConfig pretrained_model_archive_map = TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "distilbert" @@ -534,8 +540,12 @@ DISTILBERT_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. 
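# --- Illustrative sketch (assumed usage, not part of this changeset) ----------------------
# The input-parsing branches above mean the TF 2.0 models accept the same arguments three
# ways: a single tensor of input_ids, a positional list/tuple, or a dict keyed like the
# inputs.get(...) lookups. A minimal example, assuming TensorFlow 2 and the
# "distilbert-base-uncased" checkpoint from the archive map above:
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")

input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # batch size 1
attention_mask = tf.ones_like(input_ids)

outputs = model(input_ids)  # single tensor: only input_ids
outputs = model([input_ids, attention_mask])  # list/tuple: input_ids, attention_mask, ...
outputs = model({"input_ids": input_ids, "attention_mask": attention_mask})  # dict
last_hidden_states = outputs[0]  # (batch_size, seq_length, dim)
# -------------------------------------------------------------------------------------------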
""" -@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.", - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.", + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class TFDistilBertModel(TFDistilBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -561,9 +571,10 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs) - self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings def call(self, inputs, **kwargs): outputs = self.distilbert(inputs, **kwargs) @@ -580,10 +591,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): self.input_embeddings = input_embeddings def build(self, input_shape): - self.bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='bias') + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super(TFDistilBertLMHead, self).build(input_shape) def call(self, hidden_states): @@ -592,8 +600,11 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): return hidden_states -@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """, - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """DistilBert Model with a `masked language modeling` head on top. 
""", + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -619,6 +630,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): prediction_scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs) self.output_attentions = config.output_attentions @@ -626,9 +638,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): self.vocab_size = config.vocab_size self.distilbert = TFDistilBertMainLayer(config, name="distilbert") - self.vocab_transform = tf.keras.layers.Dense(config.dim, - kernel_initializer=get_initializer(config.initializer_range), - name="vocab_transform") + self.vocab_transform = tf.keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform" + ) self.act = tf.keras.layers.Activation(gelu) self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm") self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector") @@ -639,9 +651,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): def call(self, inputs, **kwargs): distilbert_output = self.distilbert(inputs, **kwargs) - hidden_states = distilbert_output[0] # (bs, seq_length, dim) - prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) - prediction_logits = self.act(prediction_logits) # (bs, seq_length, dim) + hidden_states = distilbert_output[0] # (bs, seq_length, dim) + prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) + prediction_logits = self.act(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) @@ -649,9 +661,12 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): return outputs # logits, (hidden_states), (attentions) -@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -677,36 +692,42 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.distilbert = TFDistilBertMainLayer(config, name="distilbert") - self.pre_classifier = tf.keras.layers.Dense(config.dim, - kernel_initializer=get_initializer(config.initializer_range), - activation='relu', - name="pre_classifier") - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name="classifier") + self.pre_classifier = tf.keras.layers.Dense( + config.dim, + kernel_initializer=get_initializer(config.initializer_range), + activation="relu", + name="pre_classifier", + ) + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout) def call(self, inputs, **kwargs): distilbert_output = self.distilbert(inputs, **kwargs) - hidden_state = distilbert_output[0] # (bs, seq_len, dim) - pooled_output = hidden_state[:, 0] # (bs, dim) - pooled_output = self.pre_classifier(pooled_output) # (bs, dim) - pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False)) # (bs, dim) - logits = self.classifier(pooled_output) # (bs, dim) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) # (bs, dim) + logits = self.classifier(pooled_output) # (bs, dim) outputs = (logits,) + distilbert_output[1:] return outputs # logits, (hidden_states), (attentions) -@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -728,22 +749,23 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): outputs = model(input_ids) scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.distilbert = TFDistilBertMainLayer(config, name='distilbert') + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") self.dropout = tf.keras.layers.Dropout(config.dropout) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): outputs = self.distilbert(inputs, **kwargs) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False)) + sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here @@ -751,9 +773,12 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): return outputs # scores, (hidden_states), (attentions) -@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -781,22 +806,23 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): start_scores, end_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs) self.distilbert = TFDistilBertMainLayer(config, name="distilbert") - self.qa_outputs = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='qa_outputs') + self.qa_outputs = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) assert config.num_labels == 2 self.dropout = tf.keras.layers.Dropout(config.qa_dropout) def call(self, inputs, **kwargs): distilbert_output = self.distilbert(inputs, **kwargs) - hidden_states = distilbert_output[0] # (bs, max_query_len, dim) - hidden_states = self.dropout(hidden_states, training=kwargs.get('training', False)) # (bs, max_query_len, dim) - logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) + hidden_states = distilbert_output[0] # (bs, max_query_len, dim) + hidden_states = self.dropout(hidden_states, training=kwargs.get("training", False)) # (bs, max_query_len, dim) + logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py index 718e8f605..a4722fb34 100644 --- a/transformers/modeling_tf_gpt2.py +++ b/transformers/modeling_tf_gpt2.py @@ -28,17 +28,25 @@ from io import open import numpy as np import tensorflow as tf -from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings, - TFSequenceSummary, shape_list, get_initializer) +from .modeling_tf_utils import ( + TFPreTrainedModel, + TFConv1D, + TFSharedEmbeddings, + TFSequenceSummary, + shape_list, + get_initializer, +) from .configuration_gpt2 import GPT2Config from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) -TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5", - "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5", - "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5", - "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",} +TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5", +} def gelu(x): @@ -50,8 +58,7 @@ def gelu(x): Returns: `x` with the GELU activation applied. 
""" - cdf = 0.5 * (1.0 + tf.tanh( - (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf @@ -68,8 +75,8 @@ class TFAttention(tf.keras.layers.Layer): self.split_size = n_state self.scale = scale - self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn') - self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj') + self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn") + self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj") self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) self.pruned_heads = set() @@ -82,7 +89,7 @@ class TFAttention(tf.keras.layers.Layer): """1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. """ - i = tf.range(nd)[:,None] + i = tf.range(nd)[:, None] j = tf.range(ns) m = i >= j - ns + nd return tf.cast(m, dtype) @@ -92,7 +99,7 @@ class TFAttention(tf.keras.layers.Layer): # q, k, v have shape [batch, heads, sequence, features] w = tf.matmul(q, k, transpose_b=True) if self.scale: - dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores + dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores w = w / tf.math.sqrt(dk) # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. @@ -158,8 +165,8 @@ class TFMLP(tf.keras.layers.Layer): def __init__(self, n_state, config, **kwargs): super(TFMLP, self).__init__(**kwargs) nx = config.n_embd - self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc') - self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj') + self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc") + self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj") self.act = gelu self.dropout = tf.keras.layers.Dropout(config.resid_pdrop) @@ -174,10 +181,10 @@ class TFBlock(tf.keras.layers.Layer): def __init__(self, n_ctx, config, scale=False, **kwargs): super(TFBlock, self).__init__(**kwargs) nx = config.n_embd - self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1') - self.attn = TFAttention(nx, n_ctx, config, scale, name='attn') - self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2') - self.mlp = TFMLP(4 * nx, config, name='mlp') + self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") + self.attn = TFAttention(nx, n_ctx, config, scale, name="attn") + self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") + self.mlp = TFMLP(4 * nx, config, name="mlp") def call(self, inputs, training=False): x, layer_past, attention_mask, head_mask = inputs @@ -204,20 +211,18 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): self.vocab_size = config.vocab_size self.n_embd = config.n_embd - self.wte = TFSharedEmbeddings(config.vocab_size, - config.hidden_size, - initializer_range=config.initializer_range, - name='wte') - self.wpe = tf.keras.layers.Embedding(config.n_positions, - config.n_embd, - embeddings_initializer=get_initializer(config.initializer_range), - name='wpe') + self.wte 
= TFSharedEmbeddings( + config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte" + ) + self.wpe = tf.keras.layers.Embedding( + config.n_positions, + config.n_embd, + embeddings_initializer=get_initializer(config.initializer_range), + name="wpe", + ) self.drop = tf.keras.layers.Dropout(config.embd_pdrop) - self.h = [TFBlock(config.n_ctx, - config, - scale=True, - name='h_._{}'.format(i)) for i in range(config.n_layer)] - self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f') + self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)] + self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f") def get_input_embeddings(self): return self.wte @@ -231,7 +236,17 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): """ raise NotImplementedError - def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): + def call( + self, + inputs, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past @@ -242,13 +257,13 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds assert len(inputs) <= 7, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - past = inputs.get('past', past) - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + past = inputs.get("past", past) + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 7, "Too many inputs." else: input_ids = inputs @@ -304,11 +319,11 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) if inputs_embeds is None: - inputs_embeds = self.wte(input_ids, mode='embedding') + inputs_embeds = self.wte(input_ids, mode="embedding") position_embeds = self.wpe(position_ids) if token_type_ids is not None: token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) - token_type_embeds = self.wte(token_type_ids, mode='embedding') + token_type_embeds = self.wte(token_type_ids, mode="embedding") else: token_type_embeds = 0 hidden_states = inputs_embeds + position_embeds + token_type_embeds @@ -353,6 +368,7 @@ class TFGPT2PreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = GPT2Config pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -428,8 +444,12 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs: than the model's internal embedding lookup matrix. 
""" -@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.", - GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.", + GPT2_START_DOCSTRING, + GPT2_INPUTS_DOCSTRING, +) class TFGPT2Model(TFGPT2PreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -459,17 +479,22 @@ class TFGPT2Model(TFGPT2PreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFGPT2Model, self).__init__(config, *inputs, **kwargs) - self.transformer = TFGPT2MainLayer(config, name='transformer') + self.transformer = TFGPT2MainLayer(config, name="transformer") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) return outputs -@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top -(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +@add_start_docstrings( + """The GPT2 Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). """, + GPT2_START_DOCSTRING, + GPT2_INPUTS_DOCSTRING, +) class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -500,9 +525,10 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFGPT2MainLayer(config, name='transformer') + self.transformer = TFGPT2MainLayer(config, name="transformer") def get_output_embeddings(self): return self.transformer.wte @@ -518,11 +544,15 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): return outputs # lm_logits, presents, (all hidden_states), (attentions) -@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification +@add_start_docstrings( + """The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). 
-""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +""", + GPT2_START_DOCSTRING, + GPT2_INPUTS_DOCSTRING, +) class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): r""" **mc_token_ids**: (`optional`, default to index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``: @@ -572,16 +602,30 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): lm_prediction_scores, mc_prediction_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs) config.num_labels = 1 - self.transformer = TFGPT2MainLayer(config, name='transformer') - self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head') + self.transformer = TFGPT2MainLayer(config, name="transformer") + self.multiple_choice_head = TFSequenceSummary( + config, initializer_range=config.initializer_range, name="multiple_choice_head" + ) def get_output_embeddings(self): return self.transformer.wte - def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False): + def call( + self, + inputs, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + mc_token_ids=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past @@ -593,14 +637,14 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids assert len(inputs) <= 8, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - past = inputs.get('past', past) - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) - mc_token_ids = inputs.get('mc_token_ids', mc_token_ids) + input_ids = inputs.get("input_ids") + past = inputs.get("past", past) + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + mc_token_ids = inputs.get("mc_token_ids", mc_token_ids) assert len(inputs) <= 8, "Too many inputs." 
else: input_ids = inputs @@ -617,7 +661,15 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None - flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds] + flat_inputs = [ + flat_input_ids, + past, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + inputs_embeds, + ] transformer_outputs = self.transformer(flat_inputs, training=training) hidden_states = transformer_outputs[0] diff --git a/transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py index 791c6dcc1..4720e3c5d 100644 --- a/transformers/modeling_tf_openai.py +++ b/transformers/modeling_tf_openai.py @@ -28,14 +28,22 @@ from io import open import numpy as np import tensorflow as tf -from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings, - TFSequenceSummary, shape_list, get_initializer) +from .modeling_tf_utils import ( + TFPreTrainedModel, + TFConv1D, + TFSharedEmbeddings, + TFSequenceSummary, + shape_list, + get_initializer, +) from .configuration_openai import OpenAIGPTConfig from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) -TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"} +TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = { + "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5" +} def gelu(x): @@ -47,8 +55,7 @@ def gelu(x): Returns: `x` with the GELU activation applied. """ - cdf = 0.5 * (1.0 + tf.tanh( - (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf @@ -56,9 +63,11 @@ def swish(x): return x * tf.math.sigmoid(x) -ACT_FNS = {"gelu": tf.keras.layers.Activation(gelu), - "relu": tf.keras.activations.relu, - "swish": tf.keras.layers.Activation(swish)} +ACT_FNS = { + "gelu": tf.keras.layers.Activation(gelu), + "relu": tf.keras.activations.relu, + "swish": tf.keras.layers.Activation(swish), +} class TFAttention(tf.keras.layers.Layer): @@ -74,8 +83,8 @@ class TFAttention(tf.keras.layers.Layer): self.split_size = n_state self.scale = scale - self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn') - self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj') + self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn") + self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj") self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) self.pruned_heads = set() @@ -88,7 +97,7 @@ class TFAttention(tf.keras.layers.Layer): """1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. 
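# --- Illustrative sketch (worked example, not part of this changeset) ---------------------
# What the lower-triangle mask built in TFAttention computes: ones counted from the
# lower-right corner, so each of the nd query positions may attend to every key up to and
# including its own position among the ns keys (ns >= nd when past states are cached).
# The helper below simply reproduces the body shown in the diff under a local name.
import tensorflow as tf

def lower_triangle_mask(nd, ns, dtype):
    i = tf.range(nd)[:, None]
    j = tf.range(ns)
    m = i >= j - ns + nd
    return tf.cast(m, dtype)

print(lower_triangle_mask(3, 5, tf.int32).numpy())
# [[1 1 1 0 0]
#  [1 1 1 1 0]
#  [1 1 1 1 1]]
# -------------------------------------------------------------------------------------------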
""" - i = tf.range(nd)[:,None] + i = tf.range(nd)[:, None] j = tf.range(ns) m = i >= j - ns + nd return tf.cast(m, dtype) @@ -98,7 +107,7 @@ class TFAttention(tf.keras.layers.Layer): # q, k, v have shape [batch, heads, sequence, features] w = tf.matmul(q, k, transpose_b=True) if self.scale: - dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores + dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores w = w / tf.math.sqrt(dk) # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. @@ -159,8 +168,8 @@ class TFMLP(tf.keras.layers.Layer): def __init__(self, n_state, config, **kwargs): super(TFMLP, self).__init__(**kwargs) nx = config.n_embd - self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc') - self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj') + self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc") + self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj") self.act = gelu self.dropout = tf.keras.layers.Dropout(config.resid_pdrop) @@ -175,10 +184,10 @@ class TFBlock(tf.keras.layers.Layer): def __init__(self, n_ctx, config, scale=False, **kwargs): super(TFBlock, self).__init__(**kwargs) nx = config.n_embd - self.attn = TFAttention(nx, n_ctx, config, scale, name='attn') - self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1') - self.mlp = TFMLP(4 * nx, config, name='mlp') - self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2') + self.attn = TFAttention(nx, n_ctx, config, scale, name="attn") + self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") + self.mlp = TFMLP(4 * nx, config, name="mlp") + self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") def call(self, inputs, training=False): x, attention_mask, head_mask = inputs @@ -203,19 +212,17 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): self.vocab_size = config.vocab_size self.n_embd = config.n_embd - self.tokens_embed = TFSharedEmbeddings(config.vocab_size, - config.n_embd, - initializer_range=config.initializer_range, - name='tokens_embed') - self.positions_embed = tf.keras.layers.Embedding(config.n_positions, - config.n_embd, - embeddings_initializer=get_initializer(config.initializer_range), - name='positions_embed') + self.tokens_embed = TFSharedEmbeddings( + config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed" + ) + self.positions_embed = tf.keras.layers.Embedding( + config.n_positions, + config.n_embd, + embeddings_initializer=get_initializer(config.initializer_range), + name="positions_embed", + ) self.drop = tf.keras.layers.Dropout(config.embd_pdrop) - self.h = [TFBlock(config.n_ctx, - config, - scale=True, - name='h_._{}'.format(i)) for i in range(config.n_layer)] + self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)] def get_input_embeddings(self): return self.tokens_embed @@ -229,7 +236,16 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): """ raise NotImplementedError - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + 
inputs_embeds=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -239,12 +255,12 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs @@ -295,11 +311,11 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) if inputs_embeds is None: - inputs_embeds = self.tokens_embed(input_ids, mode='embedding') + inputs_embeds = self.tokens_embed(input_ids, mode="embedding") position_embeds = self.positions_embed(position_ids) if token_type_ids is not None: token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) - token_type_embeds = self.tokens_embed(token_type_ids, mode='embedding') + token_type_embeds = self.tokens_embed(token_type_ids, mode="embedding") else: token_type_embeds = 0 hidden_states = inputs_embeds + position_embeds + token_type_embeds @@ -338,6 +354,7 @@ class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = OpenAIGPTConfig pretrained_model_archive_map = TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -409,8 +426,12 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs: than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.", - OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.", + OPENAI_GPT_START_DOCSTRING, + OPENAI_GPT_INPUTS_DOCSTRING, +) class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -436,17 +457,22 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFOpenAIGPTMainLayer(config, name='transformer') + self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) return outputs -@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top -(linear layer with weights tied to the input embeddings). 
""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +@add_start_docstrings( + """OpenAI GPT Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). """, + OPENAI_GPT_START_DOCSTRING, + OPENAI_GPT_INPUTS_DOCSTRING, +) class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -472,9 +498,10 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFOpenAIGPTMainLayer(config, name='transformer') + self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") def get_output_embeddings(self): return self.transformer.tokens_embed @@ -490,11 +517,15 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): return outputs # lm_logits, (all hidden_states), (attentions) -@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification +@add_start_docstrings( + """OpenAI GPT Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). -""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +""", + OPENAI_GPT_START_DOCSTRING, + OPENAI_GPT_INPUTS_DOCSTRING, +) class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): r""" **mc_token_ids**: (`optional`, default to index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``: @@ -536,16 +567,29 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): lm_prediction_scores, mc_prediction_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs) config.num_labels = 1 - self.transformer = TFOpenAIGPTMainLayer(config, name='transformer') - self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head') + self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") + self.multiple_choice_head = TFSequenceSummary( + config, initializer_range=config.initializer_range, name="multiple_choice_head" + ) def get_output_embeddings(self): return self.transformer.tokens_embed - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False): + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + mc_token_ids=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -556,13 +600,13 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids assert len(inputs) <= 7, "Too many inputs." 
elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) - mc_token_ids = inputs.get('mc_token_ids', mc_token_ids) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + mc_token_ids = inputs.get("mc_token_ids", mc_token_ids) assert len(inputs) <= 7, "Too many inputs." else: input_ids = inputs @@ -579,7 +623,14 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None - flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds] + flat_inputs = [ + flat_input_ids, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + inputs_embeds, + ] transformer_outputs = self.transformer(flat_inputs, training=training) hidden_states = transformer_outputs[0] diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py index 190caff18..d1073d23a 100644 --- a/transformers/modeling_tf_pytorch_utils.py +++ b/transformers/modeling_tf_pytorch_utils.py @@ -15,8 +15,7 @@ # limitations under the License. """ PyTorch - TF 2.0 general utilities.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging import os @@ -25,7 +24,8 @@ import numpy logger = logging.getLogger(__name__) -def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=''): + +def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=""): """ Convert a TF 2.0 model variable name in a pytorch model weight name. Conventions for TF2.0 scopes -> PyTorch attribute names conversions: @@ -36,26 +36,30 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove='') - pytorch model weight name - transpose: boolean indicating weither TF2.0 and PyTorch weights matrices are transposed with regards to each other """ - tf_name = tf_name.replace(':0', '') # device ids - tf_name = re.sub(r'/[^/]*___([^/]*)/', r'/\1/', tf_name) # '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch) - tf_name = tf_name.replace('_._', '/') # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList) - tf_name = re.sub(r'//+', '/', tf_name) # Remove empty levels at the end - tf_name = tf_name.split('/') # Convert from TF2.0 '/' separators to PyTorch '.' 
separators - tf_name = tf_name[1:] # Remove level zero + tf_name = tf_name.replace(":0", "") # device ids + tf_name = re.sub( + r"/[^/]*___([^/]*)/", r"/\1/", tf_name + ) # '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch) + tf_name = tf_name.replace( + "_._", "/" + ) # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList) + tf_name = re.sub(r"//+", "/", tf_name) # Remove empty levels at the end + tf_name = tf_name.split("/") # Convert from TF2.0 '/' separators to PyTorch '.' separators + tf_name = tf_name[1:] # Remove level zero # When should we transpose the weights - transpose = bool(tf_name[-1] == 'kernel' or 'emb_projs' in tf_name or 'out_projs' in tf_name) + transpose = bool(tf_name[-1] == "kernel" or "emb_projs" in tf_name or "out_projs" in tf_name) # Convert standard TF2.0 names in PyTorch names - if tf_name[-1] == 'kernel' or tf_name[-1] == 'embeddings' or tf_name[-1] == 'gamma': - tf_name[-1] = 'weight' - if tf_name[-1] == 'beta': - tf_name[-1] = 'bias' + if tf_name[-1] == "kernel" or tf_name[-1] == "embeddings" or tf_name[-1] == "gamma": + tf_name[-1] = "weight" + if tf_name[-1] == "beta": + tf_name[-1] = "bias" # Remove prefix if needed - tf_name = '.'.join(tf_name) + tf_name = ".".join(tf_name) if start_prefix_to_remove: - tf_name = tf_name.replace(start_prefix_to_remove, '', 1) + tf_name = tf_name.replace(start_prefix_to_remove, "", 1) return tf_name, transpose @@ -63,6 +67,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove='') ##################### ### PyTorch => TF 2.0 + def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False): """ Load pytorch checkpoints in a TF 2.0 model """ @@ -70,17 +75,21 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i import tensorflow as tf import torch except ImportError as e: - logger.error("Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " - "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." 
+ ) raise e pt_path = os.path.abspath(pytorch_checkpoint_path) logger.info("Loading PyTorch weights from {}".format(pt_path)) - pt_state_dict = torch.load(pt_path, map_location='cpu') + pt_state_dict = torch.load(pt_path, map_location="cpu") logger.info("PyTorch checkpoint contains {:,} parameters".format(sum(t.numel() for t in pt_state_dict.values()))) - return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys) + return load_pytorch_weights_in_tf2_model( + tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys + ) def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False): @@ -88,7 +97,9 @@ def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_mi """ pt_state_dict = pt_model.state_dict() - return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys) + return load_pytorch_weights_in_tf2_model( + tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys + ) def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False): @@ -99,8 +110,10 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a import tensorflow as tf from tensorflow.python.keras import backend as K except ImportError as e: - logger.error("Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " - "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." + ) raise e if tf_inputs is None: @@ -115,10 +128,10 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a new_keys = [] for key in pt_state_dict.keys(): new_key = None - if 'gamma' in key: - new_key = key.replace('gamma', 'weight') - if 'beta' in key: - new_key = key.replace('beta', 'bias') + if "gamma" in key: + new_key = key.replace("gamma", "weight") + if "beta" in key: + new_key = key.replace("beta", "bias") if new_key: old_keys.append(key) new_keys.append(new_key) @@ -127,9 +140,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a # Make sure we are able to load PyTorch base models as well as derived models (with heads) # TF models always have a prefix, some of PyTorch models (base ones) don't - start_prefix_to_remove = '' + start_prefix_to_remove = "" if not any(s.startswith(tf_model.base_model_prefix) for s in pt_state_dict.keys()): - start_prefix_to_remove = tf_model.base_model_prefix + '.' + start_prefix_to_remove = tf_model.base_model_prefix + "." 
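# --- Illustrative sketch (worked example, not part of this changeset) ---------------------
# How convert_tf_weight_name_to_pt_weight_name (defined above) maps a TF 2.0 variable name
# to a PyTorch state_dict key. The variable name and the "bert." prefix are hypothetical
# examples; the import path simply mirrors this file's location.
from transformers.modeling_tf_pytorch_utils import convert_tf_weight_name_to_pt_weight_name

name, transpose = convert_tf_weight_name_to_pt_weight_name(
    "tf_bert_model/bert/encoder/layer_._0/attention/self/query/kernel:0",
    start_prefix_to_remove="bert.",
)
# ":0" is stripped, "_._" becomes a level separator, "kernel" becomes "weight", the level-zero
# scope and the base-model prefix are removed, and transpose is True for Dense kernels:
# name == "encoder.layer.0.attention.self.query.weight", transpose == True
# -------------------------------------------------------------------------------------------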
symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights tf_loaded_numel = 0 @@ -137,7 +150,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a all_pytorch_weights = set(list(pt_state_dict.keys())) for symbolic_weight in symbolic_weights: sw_name = symbolic_weight.name - name, transpose = convert_tf_weight_name_to_pt_weight_name(sw_name, start_prefix_to_remove=start_prefix_to_remove) + name, transpose = convert_tf_weight_name_to_pt_weight_name( + sw_name, start_prefix_to_remove=start_prefix_to_remove + ) # Find associated numpy array in pytorch model state dict if name not in pt_state_dict: @@ -182,6 +197,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a ##################### ### TF 2.0 => PyTorch + def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False): """ Load TF 2.0 HDF5 checkpoint in a PyTorch model We use HDF5 to easily do transfer learning @@ -191,8 +207,10 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs import tensorflow as tf import torch except ImportError as e: - logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " - "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." + ) raise e import transformers @@ -215,6 +233,7 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs return load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=allow_missing_keys) + def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False): """ Load TF 2.0 model in a pytorch model """ @@ -230,8 +249,10 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F import tensorflow as tf import torch except ImportError as e: - logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " - "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." + ) raise e new_pt_params_dict = {} @@ -239,14 +260,16 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F # Make sure we are able to load PyTorch base models as well as derived models (with heads) # TF models always have a prefix, some of PyTorch models (base ones) don't - start_prefix_to_remove = '' + start_prefix_to_remove = "" if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict.keys()): - start_prefix_to_remove = pt_model.base_model_prefix + '.' + start_prefix_to_remove = pt_model.base_model_prefix + "." 
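The TF 2.0 => PyTorch direction mirrors this; a minimal sketch, again with a placeholder checkpoint path and assuming the function is imported from this module:

    from transformers import BertConfig, BertForSequenceClassification
    from transformers.modeling_tf_pytorch_utils import load_tf2_checkpoint_in_pytorch_model

    # "./tf_model.h5" stands in for a TF 2.0 HDF5 checkpoint saved with save_weights().
    config = BertConfig.from_pretrained("bert-base-uncased")
    pt_model = BertForSequenceClassification(config)
    pt_model = load_tf2_checkpoint_in_pytorch_model(pt_model, "./tf_model.h5", allow_missing_keys=True)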
# Build a map from potential PyTorch weight names to TF 2.0 Variables tf_weights_map = {} for tf_weight in tf_weights: - pt_name, transpose = convert_tf_weight_name_to_pt_weight_name(tf_weight.name, start_prefix_to_remove=start_prefix_to_remove) + pt_name, transpose = convert_tf_weight_name_to_pt_weight_name( + tf_weight.name, start_prefix_to_remove=start_prefix_to_remove + ) tf_weights_map[pt_name] = (tf_weight.numpy(), transpose) all_tf_weights = set(list(tf_weights_map.keys())) @@ -291,11 +314,13 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F missing_keys += missing_keys_pt if len(missing_keys) > 0: - logger.info("Weights of {} not initialized from TF 2.0 model: {}".format( - pt_model.__class__.__name__, missing_keys)) + logger.info( + "Weights of {} not initialized from TF 2.0 model: {}".format(pt_model.__class__.__name__, missing_keys) + ) if len(unexpected_keys) > 0: - logger.info("Weights from TF 2.0 model not used in {}: {}".format( - pt_model.__class__.__name__, unexpected_keys)) + logger.info( + "Weights from TF 2.0 model not used in {}: {}".format(pt_model.__class__.__name__, unexpected_keys) + ) logger.info("Weights or buffers not loaded from TF 2.0 model: {}".format(all_tf_weights)) diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py index 15282bd6c..136ab6615 100644 --- a/transformers/modeling_tf_roberta.py +++ b/transformers/modeling_tf_roberta.py @@ -15,8 +15,7 @@ # limitations under the License. """ TF 2.0 RoBERTa model. """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging @@ -31,16 +30,18 @@ from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new logger = logging.getLogger(__name__) TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tf_model.h5", - 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tf_model.h5", - 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tf_model.h5", - 'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-tf_model.h5", + "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tf_model.h5", + "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tf_model.h5", + "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tf_model.h5", + "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-tf_model.h5", } + class TFRobertaEmbeddings(TFBertEmbeddings): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
""" + def __init__(self, config, **kwargs): super(TFRobertaEmbeddings, self).__init__(config, **kwargs) self.padding_idx = 1 @@ -64,9 +65,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings): """ seq_length = shape_list(inputs_embeds)[1] - position_ids = tf.range(self.padding_idx + 1, - seq_length + self.padding_idx + 1, - dtype=tf.int32)[tf.newaxis, :] + position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :] return position_ids def _embedding(self, inputs, training=False): @@ -80,16 +79,19 @@ class TFRobertaEmbeddings(TFBertEmbeddings): else: position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) - return super(TFRobertaEmbeddings, self)._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) + return super(TFRobertaEmbeddings, self)._embedding( + [input_ids, position_ids, token_type_ids, inputs_embeds], training=training + ) class TFRobertaMainLayer(TFBertMainLayer): """ Same as TFBertMainLayer but uses TFRobertaEmbeddings. """ + def __init__(self, config, **kwargs): super(TFRobertaMainLayer, self).__init__(config, **kwargs) - self.embeddings = TFRobertaEmbeddings(config, name='embeddings') + self.embeddings = TFRobertaEmbeddings(config, name="embeddings") def get_input_embeddings(self): return self.embeddings @@ -99,6 +101,7 @@ class TFRobertaPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = RobertaConfig pretrained_model_archive_map = TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "roberta" @@ -192,8 +195,12 @@ ROBERTA_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.", - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.", + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class TFRobertaModel(TFRobertaPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -226,9 +233,10 @@ class TFRobertaModel(TFRobertaPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFRobertaModel, self).__init__(config, *inputs, **kwargs) - self.roberta = TFRobertaMainLayer(config, name='roberta') + self.roberta = TFRobertaMainLayer(config, name="roberta") def call(self, inputs, **kwargs): outputs = self.roberta(inputs, **kwargs) @@ -237,13 +245,14 @@ class TFRobertaModel(TFRobertaPreTrainedModel): class TFRobertaLMHead(tf.keras.layers.Layer): """Roberta Head for masked language modeling.""" + def __init__(self, config, input_embeddings, **kwargs): super(TFRobertaLMHead, self).__init__(**kwargs) self.vocab_size = config.vocab_size - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer(config.initializer_range), - name='dense') - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm') + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, 
name="layer_norm") self.act = tf.keras.layers.Activation(gelu) # The output weights are the same as the input embeddings, but there is @@ -251,10 +260,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer): self.decoder = input_embeddings def build(self, input_shape): - self.bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='bias') + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super(TFRobertaLMHead, self).build(input_shape) def call(self, features): @@ -268,8 +274,9 @@ class TFRobertaLMHead(tf.keras.layers.Layer): return x -@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) +@add_start_docstrings( + """RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING +) class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -297,6 +304,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): prediction_scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFRobertaForMaskedLM, self).__init__(config, *inputs, **kwargs) @@ -322,14 +330,16 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFRobertaClassificationHead, self).__init__(config, **kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer(config.initializer_range), - activation='tanh', - name="dense") + self.dense = tf.keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.out_proj = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name="out_proj") + self.out_proj = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" + ) def call(self, features, training=False): x = features[:, 0, :] # take token (equiv. to [CLS]) @@ -340,9 +350,12 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer): return x -@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer +@add_start_docstrings( + """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -369,27 +382,31 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.roberta = TFRobertaMainLayer(config, name="roberta") self.classifier = TFRobertaClassificationHead(config, name="classifier") - + def call(self, inputs, **kwargs): outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] - logits = self.classifier(sequence_output, training=kwargs.get('training', False)) + logits = self.classifier(sequence_output, training=kwargs.get("training", False)) outputs = (logits,) + outputs[2:] return outputs # logits, (hidden_states), (attentions) -@add_start_docstrings("""RoBERTa Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -415,22 +432,23 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFRobertaForTokenClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.roberta = TFRobertaMainLayer(config, name='roberta') + self.roberta = TFRobertaMainLayer(config, name="roberta") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False)) + sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py index e803e00c8..38a2bf419 100644 --- a/transformers/modeling_tf_t5.py +++ b/transformers/modeling_tf_t5.py @@ -31,11 +31,11 @@ from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK logger = logging.getLogger(__name__) TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = { - 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5", - 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5", - 't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5", - 't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5", - 't5-11b': 
"https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5", + "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5", + "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5", + "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5", + "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5", + "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5", } #################################################### @@ -44,6 +44,7 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = { # - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model) #################################################### + class TFT5LayerNorm(tf.keras.layers.Layer): def __init__(self, epsilon=1e-6, **kwargs): """ Construct a layernorm module in the T5 style @@ -54,10 +55,7 @@ class TFT5LayerNorm(tf.keras.layers.Layer): def build(self, input_shape): """Build shared word embedding layer """ - self.weight = self.add_weight( - "weight", - shape=(input_shape[-1],), - initializer='ones') + self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones") super(TFT5LayerNorm, self).build(input_shape) def call(self, x): @@ -69,8 +67,8 @@ class TFT5LayerNorm(tf.keras.layers.Layer): class TFT5DenseReluDense(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFT5DenseReluDense, self).__init__(**kwargs) - self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name='wi') - self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name='wo') + self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name="wi") + self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name="wo") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) self.act = tf.keras.activations.relu @@ -85,9 +83,8 @@ class TFT5DenseReluDense(tf.keras.layers.Layer): class TFT5LayerFF(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFT5LayerFF, self).__init__(**kwargs) - self.DenseReluDense = TFT5DenseReluDense(config, name='DenseReluDense') - self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, - name='layer_norm') + self.DenseReluDense = TFT5DenseReluDense(config, name="DenseReluDense") + self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def call(self, hidden_states, training=False): @@ -114,26 +111,23 @@ class TFT5Attention(tf.keras.layers.Layer): self.inner_dim = self.n_heads * self.d_kv # Mesh TensorFlow initialization to avoid scaling before softmax - self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='q') - self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='k') - self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='v') - self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name='o') + self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="q") + self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="k") + self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="v") + self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name="o") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) if self.has_relative_attention_bias: - self.relative_attention_bias = tf.keras.layers.Embedding(self.relative_attention_num_buckets, - self.n_heads, - name='relative_attention_bias') + self.relative_attention_bias = 
tf.keras.layers.Embedding( + self.relative_attention_num_buckets, self.n_heads, name="relative_attention_bias" + ) self.pruned_heads = set() def prune_heads(self, heads): raise NotImplementedError @staticmethod - def _relative_position_bucket(relative_position, - bidirectional=True, - num_buckets=32, - max_distance=128): + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): """ Adapted from Mesh Tensorflow: https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 @@ -170,7 +164,10 @@ class TFT5Attention(tf.keras.layers.Layer): is_small = tf.math.less(n, max_exact) val_if_large = max_exact + tf.dtypes.cast( tf.math.log(tf.dtypes.cast(n, tf.float32) / max_exact) - / math.log(max_distance / max_exact) * (num_buckets - max_exact), tf.int32) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact), + tf.int32, + ) val_if_large = tf.math.minimum(val_if_large, num_buckets - 1) ret += tf.where(is_small, n, val_if_large) return ret @@ -180,11 +177,11 @@ class TFT5Attention(tf.keras.layers.Layer): context_position = tf.range(qlen)[:, None] memory_position = tf.range(klen)[None, :] relative_position = memory_position - context_position # shape (qlen, klen) - rp_bucket = self._relative_position_bucket(relative_position, - bidirectional=not self.is_decoder, - num_buckets=self.relative_attention_num_buckets) + rp_bucket = self._relative_position_bucket( + relative_position, bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets + ) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) - values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen) + values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen) return values def call(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None, training=False): @@ -195,7 +192,7 @@ class TFT5Attention(tf.keras.layers.Layer): # Mask is (bs, klen) (non-causal) or (bs, klen, klen) bs, qlen, dim = shape_list(input) if kv is None: - klen = qlen if cache is None else cache['slen'] + qlen + klen = qlen if cache is None else cache["slen"] + qlen else: klen = shape_list(kv)[1] @@ -207,28 +204,28 @@ class TFT5Attention(tf.keras.layers.Layer): """ compute context """ return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.inner_dim)) - q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) + q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: - k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head) elif cache is None or self.layer_id not in cache: k = v = kv - k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head) if cache is not None: if self.layer_id in cache: if kv is None: k_, v_ = cache[self.layer_id] - k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) - v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) + k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) + v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, 
dim_per_head) else: k, v = cache[self.layer_id] cache[self.layer_id] = (k, v) # q = q / math.sqrt(dim_per_head) # No scaling in T5 # scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) - scores = tf.einsum('bnqd,bnkd->bnqk', q, k) # (bs, n_heads, qlen, klen) + scores = tf.einsum("bnqd,bnkd->bnqk", q, k) # (bs, n_heads, qlen, klen) if position_bias is None: if not self.has_relative_attention_bias: @@ -240,15 +237,15 @@ class TFT5Attention(tf.keras.layers.Layer): # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) scores += position_bias - weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) - weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) + weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask - context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) - context = unshape(context) # (bs, qlen, dim) + context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) context = self.o(context) @@ -263,21 +260,17 @@ class TFT5Attention(tf.keras.layers.Layer): class TFT5LayerSelfAttention(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): super(TFT5LayerSelfAttention, self).__init__(**kwargs) - self.SelfAttention = TFT5Attention(config, - has_relative_attention_bias=has_relative_attention_bias, - name='SelfAttention') - self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, - name='layer_norm') + self.SelfAttention = TFT5Attention( + config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention" + ) + self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) - def call(self, hidden_states, attention_mask=None, position_bias=None, - head_mask=None, training=False): + def call(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None, training=False): norm_x = self.layer_norm(hidden_states) - attention_output = self.SelfAttention(norm_x, - mask=attention_mask, - position_bias=position_bias, - head_mask=head_mask, - training=training) + attention_output = self.SelfAttention( + norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask, training=training + ) y = attention_output[0] layer_output = hidden_states + self.dropout(y, training=training) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -287,22 +280,17 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer): class TFT5LayerCrossAttention(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): super(TFT5LayerCrossAttention, self).__init__(**kwargs) - self.EncDecAttention = TFT5Attention(config, - has_relative_attention_bias=has_relative_attention_bias, - name='EncDecAttention') - self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, - name='layer_norm') + self.EncDecAttention = TFT5Attention( + config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention" + ) + self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) - def call(self, hidden_states, kv, attention_mask=None, position_bias=None, - head_mask=None, 
training=False): + def call(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None, training=False): norm_x = self.layer_norm(hidden_states) - attention_output = self.EncDecAttention(norm_x, - mask=attention_mask, - kv=kv, - position_bias=position_bias, - head_mask=head_mask, - training=training) + attention_output = self.EncDecAttention( + norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask, training=training + ) y = attention_output[0] layer_output = hidden_states + self.dropout(y, training=training) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -314,43 +302,57 @@ class TFT5Block(tf.keras.layers.Layer): super(TFT5Block, self).__init__(**kwargs) self.is_decoder = config.is_decoder self.layer = [] - self.layer.append(TFT5LayerSelfAttention(config, - has_relative_attention_bias=has_relative_attention_bias, - name='layer_._0')) + self.layer.append( + TFT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._0") + ) if self.is_decoder: - self.layer.append(TFT5LayerCrossAttention(config, - has_relative_attention_bias=has_relative_attention_bias, - name='layer_._1')) - self.layer.append(TFT5LayerFF(config, name='layer_._2')) + self.layer.append( + TFT5LayerCrossAttention( + config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._1" + ) + ) + self.layer.append(TFT5LayerFF(config, name="layer_._2")) else: - self.layer.append(TFT5LayerFF(config, name='layer_._1')) - - def call(self, hidden_states, attention_mask=None, position_bias=None, - encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None, - head_mask=None, training=False): - self_attention_outputs = self.layer[0](hidden_states, - attention_mask=attention_mask, - position_bias=position_bias, - head_mask=head_mask, - training=training) + self.layer.append(TFT5LayerFF(config, name="layer_._1")) + + def call( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + head_mask=None, + training=False, + ): + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + head_mask=head_mask, + training=training, + ) hidden_states = self_attention_outputs[0] outputs = self_attention_outputs[1:] if not self.is_decoder: hidden_states = self.layer[1](hidden_states, training=training) else: - cross_attention_outputs = self.layer[1](hidden_states, - kv=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - head_mask=head_mask, - training=training) + cross_attention_outputs = self.layer[1]( + hidden_states, + kv=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + head_mask=head_mask, + training=training, + ) hidden_states = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:] hidden_states = self.layer[2](hidden_states, training=training) outputs = (hidden_states,) + outputs # add attentions if we output them - return outputs # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) + return outputs # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) 
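For intuition, the relative-position matrix that compute_bias feeds into _relative_position_bucket above is simply the signed distance between key and query positions; a standalone sketch with made-up lengths, using plain TensorFlow only:

    import tensorflow as tf

    qlen, klen = 4, 6
    context_position = tf.range(qlen)[:, None]               # query positions, shape (qlen, 1)
    memory_position = tf.range(klen)[None, :]                 # key positions,   shape (1, klen)
    relative_position = memory_position - context_position    # shape (qlen, klen), entry [i, j] = j - i
    # _relative_position_bucket then maps these signed distances into a small fixed
    # set of buckets (exact for short distances, logarithmically coarser for long
    # ones) before the relative_attention_bias embedding lookup.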
#################################################### @@ -366,12 +368,11 @@ class TFT5MainLayer(tf.keras.layers.Layer): self.config = config self.num_hidden_layers = config.num_layers - self.block = [TFT5Block(config, - has_relative_attention_bias=bool(i == 0), - name='block_._{}'.format(i)) - for i in range(config.num_layers)] - self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, - name='final_layer_norm') + self.block = [ + TFT5Block(config, has_relative_attention_bias=bool(i == 0), name="block_._{}".format(i)) + for i in range(config.num_layers) + ] + self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def _resize_token_embeddings(self, new_num_tokens): @@ -380,8 +381,15 @@ class TFT5MainLayer(tf.keras.layers.Layer): def _prune_heads(self, heads_to_prune): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models - def call(self, hidden_states, attention_mask=None, encoder_hidden_states=None, - encoder_attention_mask=None, head_mask=None, training=False): + def call( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + training=False, + ): batch_size, seq_length = shape_list(hidden_states)[:2] if attention_mask is None: @@ -397,13 +405,14 @@ class TFT5MainLayer(tf.keras.layers.Layer): if num_dims_attention_mask == 3: extended_attention_mask = attention_mask[:, None, :, :] elif num_dims_attention_mask == 2: - # Provided a padding mask of dimensions [batch_size, seq_length] - # - if the model is a decoder, apply a causal mask in addition to the padding mask - # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder: seq_ids = tf.range(seq_length) - causal_mask = tf.less_equal(tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)), - seq_ids[None, :, None]) + causal_mask = tf.less_equal( + tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)), seq_ids[None, :, None] + ) causal_mask = tf.cast(causal_mask, dtype=tf.float32) extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] else: @@ -460,14 +469,16 @@ class TFT5MainLayer(tf.keras.layers.Layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - layer_outputs = layer_module(hidden_states, - attention_mask=extended_attention_mask, - position_bias=position_bias, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - encoder_decoder_position_bias=encoder_decoder_position_bias, - head_mask=head_mask[i], - training=training) + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + head_mask=head_mask[i], + training=training, + ) hidden_states = layer_outputs[0] if i == 0: # We share the position biases between the layers - the first layer store them @@ -505,6 +516,7 @@ class TFT5PreTrainedModel(TFPreTrainedModel): """ An 
abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = T5Config pretrained_model_archive_map = TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -513,9 +525,11 @@ class TFT5PreTrainedModel(TFPreTrainedModel): def dummy_inputs(self): input_ids = tf.constant(DUMMY_INPUTS) input_mask = tf.constant(DUMMY_MASK) - dummy_inputs = {'decoder_input_ids': input_ids, - 'encoder_input_ids': input_ids, - 'decoder_attention_mask': input_mask} + dummy_inputs = { + "decoder_input_ids": input_ids, + "encoder_input_ids": input_ids, + "decoder_attention_mask": input_mask, + } return dummy_inputs @@ -586,9 +600,12 @@ T5_INPUTS_DOCSTRING = r""" ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. """ -@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states" - "without any specific head on top.", - T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.", + T5_START_DOCSTRING, + T5_INPUTS_DOCSTRING, +) class TFT5Model(TFT5PreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -614,17 +631,17 @@ class TFT5Model(TFT5PreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFT5Model, self).__init__(config, *inputs, **kwargs) - self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, - name='shared') + self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared") encoder_config = copy.deepcopy(config) - self.encoder = TFT5MainLayer(encoder_config, name='encoder') + self.encoder = TFT5MainLayer(encoder_config, name="encoder") decoder_config = copy.deepcopy(config) decoder_config.is_decoder = True - self.decoder = TFT5MainLayer(decoder_config, name='decoder') + self.decoder = TFT5MainLayer(decoder_config, name="decoder") def get_input_embeddings(self): return self.shared @@ -641,14 +658,15 @@ class TFT5Model(TFT5PreTrainedModel): if isinstance(decoder_input_ids, dict): kwargs.update(decoder_input_ids) else: - kwargs['decoder_input_ids'] = decoder_input_ids + kwargs["decoder_input_ids"] = decoder_input_ids - kwargs_common = dict((k, v) for k, v in kwargs.items() - if not k.startswith("encoder_") and not k.startswith("decoder_")) + kwargs_common = dict( + (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_") + ) kwargs_encoder = kwargs_common.copy() kwargs_decoder = kwargs_common.copy() - kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_"))) - kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_"))) + kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_"))) + kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_"))) # Encode if needed (training, first prediction pass) encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) @@ -678,8 +696,7 @@ class TFT5Model(TFT5PreTrainedModel): return decoder_outputs + encoder_outputs -@add_start_docstrings("""T5 Model with a `language modeling` head on top. 
""", - T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) +@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) class TFT5WithLMHeadModel(TFT5PreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -705,19 +722,19 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel): prediction_scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs) self.model_dim = config.d_model - self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, - name='shared') + self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared") encoder_config = copy.deepcopy(config) - self.encoder = TFT5MainLayer(encoder_config, name='encoder') + self.encoder = TFT5MainLayer(encoder_config, name="encoder") decoder_config = copy.deepcopy(config) decoder_config.is_decoder = True - self.decoder = TFT5MainLayer(decoder_config, name='decoder') + self.decoder = TFT5MainLayer(decoder_config, name="decoder") def get_input_embeddings(self): return self.shared @@ -734,14 +751,15 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel): if isinstance(decoder_input_ids, dict): kwargs.update(decoder_input_ids) else: - kwargs['decoder_input_ids'] = decoder_input_ids + kwargs["decoder_input_ids"] = decoder_input_ids - kwargs_common = dict((k, v) for k, v in kwargs.items() - if not k.startswith("encoder_") and not k.startswith("decoder_")) + kwargs_common = dict( + (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_") + ) kwargs_encoder = kwargs_common.copy() kwargs_decoder = kwargs_common.copy() - kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_"))) - kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_"))) + kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_"))) + kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_"))) # Encode if needed (training, first prediction pass) encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py index 08bbe7403..fc7ea932a 100644 --- a/transformers/modeling_tf_transfo_xl.py +++ b/transformers/modeling_tf_transfo_xl.py @@ -37,9 +37,10 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-tf_model.h5", + "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-tf_model.h5", } + class TFPositionalEmbedding(tf.keras.layers.Layer): def __init__(self, demb, **kwargs): super(TFPositionalEmbedding, self).__init__(**kwargs) @@ -47,7 +48,7 @@ class TFPositionalEmbedding(tf.keras.layers.Layer): self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb)) def call(self, pos_seq, bsz=None): - sinusoid_inp = tf.einsum('i,j->ij', pos_seq, self.inv_freq) + sinusoid_inp = tf.einsum("i,j->ij", pos_seq, self.inv_freq) pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1) if bsz is not None: @@ -64,17 +65,14 @@ class TFPositionwiseFF(tf.keras.layers.Layer): self.d_inner = d_inner self.dropout = dropout - self.layer_1 = 
tf.keras.layers.Dense(d_inner, - kernel_initializer=get_initializer(init_std), - activation=tf.nn.relu, - name='CoreNet_._0') + self.layer_1 = tf.keras.layers.Dense( + d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0" + ) self.drop_1 = tf.keras.layers.Dropout(dropout) - self.layer_2 = tf.keras.layers.Dense(d_model, - kernel_initializer=get_initializer(init_std), - name='CoreNet_._3') + self.layer_2 = tf.keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3") self.drop_2 = tf.keras.layers.Dropout(dropout) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm') + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") self.pre_lnorm = pre_lnorm @@ -103,10 +101,24 @@ class TFPositionwiseFF(tf.keras.layers.Layer): class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): - def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, - tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, - r_r_bias=None, r_w_bias=None, output_attentions=False, - layer_norm_epsilon=1e-5, init_std=0.02, **kwargs): + def __init__( + self, + n_head, + d_model, + d_head, + dropout, + dropatt=0, + tgt_len=None, + ext_len=None, + mem_len=None, + pre_lnorm=False, + r_r_bias=None, + r_w_bias=None, + output_attentions=False, + layer_norm_epsilon=1e-5, + init_std=0.02, + **kwargs + ): super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs) self.output_attentions = output_attentions @@ -115,46 +127,41 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): self.d_head = d_head self.dropout = dropout - self.qkv_net = tf.keras.layers.Dense(3 * n_head * d_head, - kernel_initializer=get_initializer(init_std), - use_bias=False, - name='qkv_net') + self.qkv_net = tf.keras.layers.Dense( + 3 * n_head * d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="qkv_net" + ) self.drop = tf.keras.layers.Dropout(dropout) self.dropatt = tf.keras.layers.Dropout(dropatt) - self.o_net = tf.keras.layers.Dense(d_model, - kernel_initializer=get_initializer(init_std), - use_bias=False, - name='o_net') + self.o_net = tf.keras.layers.Dense( + d_model, kernel_initializer=get_initializer(init_std), use_bias=False, name="o_net" + ) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm') + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") self.scale = 1 / (d_head ** 0.5) self.pre_lnorm = pre_lnorm - if r_r_bias is not None and r_w_bias is not None: # Biases are shared + if r_r_bias is not None and r_w_bias is not None: # Biases are shared self.r_r_bias = r_r_bias self.r_w_bias = r_w_bias else: self.r_r_bias = None self.r_w_bias = None - self.r_net = tf.keras.layers.Dense(self.n_head * self.d_head, - kernel_initializer=get_initializer(init_std), - use_bias=False, - name='r_net') + self.r_net = tf.keras.layers.Dense( + self.n_head * self.d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="r_net" + ) def build(self, input_shape): - if self.r_r_bias is None or self.r_w_bias is None: # Biases are not shared - self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer='zeros', - trainable=True, - name='r_r_bias') - self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer='zeros', - trainable=True, - name='r_w_bias') + if self.r_r_bias is None or self.r_w_bias is 
None: # Biases are not shared + self.r_r_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" + ) + self.r_w_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" + ) super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape) def _rel_shift(self, x): @@ -196,14 +203,14 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): w_head_k = tf.reshape(w_head_k, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head w_head_v = tf.reshape(w_head_v, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head - r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head)) # qlen x n_head x d_head + r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head)) # qlen x n_head x d_head #### compute attention score - rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head - AC = tf.einsum('ibnd,jbnd->ijbn', rw_head_q, w_head_k) # qlen x klen x bsz x n_head + rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head + AC = tf.einsum("ibnd,jbnd->ijbn", rw_head_q, w_head_k) # qlen x klen x bsz x n_head rr_head_q = w_head_q + self.r_r_bias - BD = tf.einsum('ibnd,jnd->ijbn', rr_head_q, r_head_k) # qlen x klen x bsz x n_head + BD = tf.einsum("ibnd,jnd->ijbn", rr_head_q, r_head_k) # qlen x klen x bsz x n_head BD = self._rel_shift(BD) # [qlen x klen x bsz x n_head] @@ -224,12 +231,11 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): attn_prob = attn_prob * head_mask #### compute attention vector - attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, w_head_v) + attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, w_head_v) # [qlen x bsz x n_head x d_head] attn_vec_sizes = shape_list(attn_vec) - attn_vec = tf.reshape(attn_vec, - (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head)) + attn_vec = tf.reshape(attn_vec, (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head)) ##### linear projection attn_out = self.o_net(attn_vec) @@ -249,32 +255,57 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): - def __init__(self, n_head, d_model, d_head, d_inner, dropout, - tgt_len=None, ext_len=None, mem_len=None, - dropatt=0., pre_lnorm=False, - r_w_bias=None, - r_r_bias=None, - output_attentions=False, - layer_norm_epsilon=1e-5, - init_std=0.02, - **kwargs): + def __init__( + self, + n_head, + d_model, + d_head, + d_inner, + dropout, + tgt_len=None, + ext_len=None, + mem_len=None, + dropatt=0.0, + pre_lnorm=False, + r_w_bias=None, + r_r_bias=None, + output_attentions=False, + layer_norm_epsilon=1e-5, + init_std=0.02, + **kwargs + ): super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs) - self.dec_attn = TFRelPartialLearnableMultiHeadAttn(n_head, d_model, - d_head, dropout, tgt_len=tgt_len, ext_len=ext_len, - mem_len=mem_len, dropatt=dropatt, pre_lnorm=pre_lnorm, - r_w_bias=r_w_bias, r_r_bias=r_r_bias, init_std=init_std, - output_attentions=output_attentions, - layer_norm_epsilon=layer_norm_epsilon, name='dec_attn') - self.pos_ff = TFPositionwiseFF(d_model, d_inner, dropout, - pre_lnorm=pre_lnorm, init_std=init_std, - layer_norm_epsilon=layer_norm_epsilon, - name='pos_ff') + self.dec_attn = TFRelPartialLearnableMultiHeadAttn( + n_head, + d_model, + d_head, + dropout, + tgt_len=tgt_len, + ext_len=ext_len, + mem_len=mem_len, + dropatt=dropatt, + pre_lnorm=pre_lnorm, + r_w_bias=r_w_bias, + r_r_bias=r_r_bias, 
+ init_std=init_std, + output_attentions=output_attentions, + layer_norm_epsilon=layer_norm_epsilon, + name="dec_attn", + ) + self.pos_ff = TFPositionwiseFF( + d_model, + d_inner, + dropout, + pre_lnorm=pre_lnorm, + init_std=init_std, + layer_norm_epsilon=layer_norm_epsilon, + name="pos_ff", + ) def call(self, inputs, training=False): dec_inp, r, dec_attn_mask, mems, head_mask = inputs - attn_outputs = self.dec_attn([dec_inp, r, dec_attn_mask, - mems, head_mask], training=training) + attn_outputs = self.dec_attn([dec_inp, r, dec_attn_mask, mems, head_mask], training=training) ff_output = self.pos_ff(attn_outputs[0], training=training) outputs = [ff_output] + attn_outputs[1:] @@ -283,8 +314,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): class TFAdaptiveEmbedding(tf.keras.layers.Layer): - def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, - sample_softmax=False, **kwargs): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs): super(TFAdaptiveEmbedding, self).__init__(**kwargs) self.n_token = n_token @@ -305,20 +335,28 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint else: for i in range(len(self.cutoffs)): - l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = d_embed // (div_val ** i) - self.emb_layers.append(tf.keras.layers.Embedding(r_idx-l_idx, - d_emb_i, - embeddings_initializer=get_initializer(init_std), - name='emb_layers_._{}'.format(i))) + self.emb_layers.append( + tf.keras.layers.Embedding( + r_idx - l_idx, + d_emb_i, + embeddings_initializer=get_initializer(init_std), + name="emb_layers_._{}".format(i), + ) + ) def build(self, input_shape): for i in range(len(self.cutoffs)): d_emb_i = self.d_embed // (self.div_val ** i) - self.emb_projs.append(self.add_weight(shape=(d_emb_i, self.d_proj), - initializer=get_initializer(self.init_std), - trainable=True, - name='emb_projs_._{}'.format(i))) + self.emb_projs.append( + self.add_weight( + shape=(d_emb_i, self.d_proj), + initializer=get_initializer(self.init_std), + trainable=True, + name="emb_projs_._{}".format(i), + ) + ) super(TFAdaptiveEmbedding, self).build(input_shape) def call(self, inp): @@ -334,7 +372,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): inp_i = tf.boolean_mask(inp_flat, mask_i) - l_idx emb_i = self.emb_layers[i](inp_i) - emb_i = tf.einsum('id,de->ie', emb_i, self.emb_projs[i]) + emb_i = tf.einsum("id,de->ie", emb_i, self.emb_projs[i]) mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64) emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(shape_list(emb_flat), dtype=tf.int64)) @@ -361,8 +399,15 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): self.d_head = config.d_head self.untie_r = config.untie_r - self.word_emb = TFAdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs, - div_val=config.div_val, init_std=config.init_std, name='word_emb') + self.word_emb = TFAdaptiveEmbedding( + config.vocab_size, + config.d_embed, + config.d_model, + config.cutoffs, + div_val=config.div_val, + init_std=config.init_std, + name="word_emb", + ) self.drop = tf.keras.layers.Dropout(config.dropout) @@ -376,41 +421,47 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): self.attn_type = config.attn_type self.layers = [] - if config.attn_type == 0: # the default attention + if 
config.attn_type == 0: # the default attention for i in range(config.n_layer): self.layers.append( TFRelPartialLearnableDecoderLayer( - config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, - tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, - dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, + config.n_head, + config.d_model, + config.d_head, + config.d_inner, + config.dropout, + tgt_len=config.tgt_len, + ext_len=config.ext_len, + mem_len=config.mem_len, + dropatt=config.dropatt, + pre_lnorm=config.pre_lnorm, r_w_bias=None if self.untie_r else self.r_w_bias, r_r_bias=None if self.untie_r else self.r_r_bias, output_attentions=self.output_attentions, layer_norm_epsilon=config.layer_norm_epsilon, init_std=config.init_std, - name='layers_._{}'.format(i)) + name="layers_._{}".format(i), + ) ) - else: # learnable embeddings and absolute embeddings + else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint self.same_length = config.same_length self.clamp_len = config.clamp_len - if self.attn_type == 0: # default attention - self.pos_emb = TFPositionalEmbedding(self.d_model, name='pos_emb') - else: # learnable embeddings and absolute embeddings + if self.attn_type == 0: # default attention + self.pos_emb = TFPositionalEmbedding(self.d_model, name="pos_emb") + else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint def build(self, input_shape): if not self.untie_r: - self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer='zeros', - trainable=True, - name='r_w_bias') - self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer='zeros', - trainable=True, - name='r_r_bias') + self.r_w_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" + ) + self.r_r_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" + ) super(TFTransfoXLMainLayer, self).build(input_shape) def get_input_embeddings(self): @@ -443,10 +494,11 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): def _update_mems(self, hids, mems, qlen, mlen): # does not deal with None - if mems is None: return None + if mems is None: + return None # mems is not None - assert len(hids) == len(mems), 'len(hids) != len(mems)' + assert len(hids) == len(mems), "len(hids) != len(mems)" # There are `mlen + qlen` steps that can be cached into mems # For the next step, the last `ext_len` of the `qlen` tokens @@ -472,10 +524,10 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds assert len(inputs) <= 4, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - mems = inputs.get('mems', mems) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + mems = inputs.get("mems", mems) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 4, "Too many inputs." 
else: input_ids = inputs @@ -521,8 +573,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): dec_attn_mask = tf.concat([attn_mask_pad, mask_u - mask_dia], 1) if self.same_length: mask_l = tf.linalg.band_part(attn_mask, -1, 0) - dec_attn_mask = tf.concat([dec_attn_mask[:, :qlen] + mask_l - mask_dia, - dec_attn_mask[:, qlen:]], 1) + dec_attn_mask = tf.concat([dec_attn_mask[:, :qlen] + mask_l - mask_dia, dec_attn_mask[:, qlen:]], 1) # ::: PyTorch masking code for reference ::: # if self.same_length: # all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8) @@ -539,8 +590,8 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): hids = [] attentions = [] - if self.attn_type == 0: # default - pos_seq = tf.range(klen-1, -1, -1.0) + if self.attn_type == 0: # default + pos_seq = tf.range(klen - 1, -1, -1.0) if self.clamp_len > 0: pos_seq = tf.minimum(pos_seq, self.clamp_len) pos_emb = self.pos_emb(pos_seq) @@ -551,12 +602,11 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): for i, layer in enumerate(self.layers): hids.append(core_out) mems_i = None if mems is None else mems[i] - layer_outputs = layer([core_out, pos_emb, dec_attn_mask, - mems_i, head_mask[i]], training=training) + layer_outputs = layer([core_out, pos_emb, dec_attn_mask, mems_i, head_mask[i]], training=training) core_out = layer_outputs[0] if self.output_attentions: attentions.append(layer_outputs[1]) - else: # learnable embeddings and absolute embeddings + else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint core_out = self.drop(core_out, training=training) @@ -581,6 +631,7 @@ class TFTransfoXLPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = TransfoXLConfig pretrained_model_archive_map = TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -647,8 +698,12 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. 
""" -@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", - TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", + TRANSFO_XL_START_DOCSTRING, + TRANSFO_XL_INPUTS_DOCSTRING, +) class TFTransfoXLModel(TFTransfoXLPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -678,18 +733,22 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel): last_hidden_states, mems = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFTransfoXLModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFTransfoXLMainLayer(config, name='transformer') + self.transformer = TFTransfoXLMainLayer(config, name="transformer") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) return outputs -@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top +@add_start_docstrings( + """The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive input embeddings)""", - TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING) + TRANSFO_XL_START_DOCSTRING, + TRANSFO_XL_INPUTS_DOCSTRING, +) class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -720,17 +779,19 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): prediction_scores, mems = outputs[:2] """ + def __init__(self, config): super(TFTransfoXLLMHeadModel, self).__init__(config) - self.transformer = TFTransfoXLMainLayer(config, name='transformer') + self.transformer = TFTransfoXLMainLayer(config, name="transformer") self.sample_softmax = config.sample_softmax # use sampled softmax if config.sample_softmax > 0: raise NotImplementedError # use adaptive softmax (including standard softmax) else: - self.crit = TFAdaptiveSoftmaxMask(config.vocab_size, config.d_embed, config.d_model, - config.cutoffs, div_val=config.div_val, name='crit') + self.crit = TFAdaptiveSoftmaxMask( + config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, name="crit" + ) def reset_length(self, tgt_len, ext_len, mem_len): self.transformer.reset_length(tgt_len, ext_len, mem_len) @@ -747,11 +808,11 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): labels = inputs[4] if len(inputs) > 4 else labels assert len(inputs) <= 5, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - mems = inputs.get('mems', mems) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) - labels = inputs.get('labels', labels) + input_ids = inputs.get("input_ids") + mems = inputs.get("mems", mems) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + labels = inputs.get("labels", labels) assert len(inputs) <= 5, "Too many inputs." 
else: input_ids = inputs diff --git a/transformers/modeling_tf_transfo_xl_utilities.py b/transformers/modeling_tf_transfo_xl_utilities.py index f730af851..0f2a4ebeb 100644 --- a/transformers/modeling_tf_transfo_xl_utilities.py +++ b/transformers/modeling_tf_transfo_xl_utilities.py @@ -24,9 +24,9 @@ import tensorflow as tf from .modeling_tf_utils import shape_list + class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): - def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, - keep_order=False, **kwargs): + def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs): super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs) self.vocab_size = vocab_size @@ -47,52 +47,59 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): def build(self, input_shape): if self.n_clusters > 0: - self.cluster_weight = self.add_weight(shape=(self.n_clusters, self.d_embed), - initializer='zeros', - trainable=True, - name='cluster_weight') - self.cluster_bias = self.add_weight(shape=(self.n_clusters,), - initializer='zeros', - trainable=True, - name='cluster_bias') + self.cluster_weight = self.add_weight( + shape=(self.n_clusters, self.d_embed), initializer="zeros", trainable=True, name="cluster_weight" + ) + self.cluster_bias = self.add_weight( + shape=(self.n_clusters,), initializer="zeros", trainable=True, name="cluster_bias" + ) if self.div_val == 1: for i in range(len(self.cutoffs)): if self.d_proj != self.d_embed: - weight = self.add_weight(shape=(self.d_embed, self.d_proj), - initializer='zeros', - trainable=True, - name='out_projs_._{}'.format(i)) + weight = self.add_weight( + shape=(self.d_embed, self.d_proj), + initializer="zeros", + trainable=True, + name="out_projs_._{}".format(i), + ) self.out_projs.append(weight) else: self.out_projs.append(None) - weight = self.add_weight(shape=(self.vocab_size, self.d_embed,), - initializer='zeros', - trainable=True, - name='out_layers_._{}_._weight'.format(i)) - bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='out_layers_._{}_._bias'.format(i)) + weight = self.add_weight( + shape=(self.vocab_size, self.d_embed,), + initializer="zeros", + trainable=True, + name="out_layers_._{}_._weight".format(i), + ) + bias = self.add_weight( + shape=(self.vocab_size,), + initializer="zeros", + trainable=True, + name="out_layers_._{}_._bias".format(i), + ) self.out_layers.append((weight, bias)) else: for i in range(len(self.cutoffs)): - l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = self.d_embed // (self.div_val ** i) - weight = self.add_weight(shape=(d_emb_i, self.d_proj), - initializer='zeros', - trainable=True, - name='out_projs_._{}'.format(i)) + weight = self.add_weight( + shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name="out_projs_._{}".format(i) + ) self.out_projs.append(weight) - weight = self.add_weight(shape=(r_idx-l_idx, d_emb_i,), - initializer='zeros', - trainable=True, - name='out_layers_._{}_._weight'.format(i)) - bias = self.add_weight(shape=(r_idx-l_idx,), - initializer='zeros', - trainable=True, - name='out_layers_._{}_._bias'.format(i)) + weight = self.add_weight( + shape=(r_idx - l_idx, d_emb_i,), + initializer="zeros", + trainable=True, + name="out_layers_._{}_._weight".format(i), + ) + bias = self.add_weight( + shape=(r_idx - l_idx,), + initializer="zeros", + trainable=True, + name="out_layers_._{}_._bias".format(i), + ) self.out_layers.append((weight, bias)) 
super(TFAdaptiveSoftmaxMask, self).build(input_shape) @@ -100,8 +107,8 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): def _logit(x, W, b, proj=None): y = x if proj is not None: - y = tf.einsum('ibd,ed->ibe', y, proj) - return tf.einsum('ibd,nd->ibn', y, W) + b + y = tf.einsum("ibd,ed->ibe", y, proj) + return tf.einsum("ibd,nd->ibn", y, W) + b @staticmethod def _gather_logprob(logprob, target): @@ -114,7 +121,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): hidden, target = inputs head_logprob = 0 if self.n_clusters == 0: - softmax_b = tf.get_variable('bias', [self.config.vocab_size], initializer=tf.zeros_initializer()) + softmax_b = tf.get_variable("bias", [self.config.vocab_size], initializer=tf.zeros_initializer()) output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0]) if target is not None: loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output) @@ -143,7 +150,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0]) head_logprob = tf.nn.log_softmax(head_logit) - out.append(head_logprob[..., :self.cutoffs[0]]) + out.append(head_logprob[..., : self.cutoffs[0]]) if target is not None: cur_head_logprob = tf.boolean_mask(head_logprob, mask) cur_logprob = self._gather_logprob(cur_head_logprob, cur_target) @@ -170,6 +177,6 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): # Log the loss as a metric (we could log arbitrary metrics, # including different metrics for training and inference. - self.add_metric(loss, name=self.name, aggregation='mean' if return_mean else '') + self.add_metric(loss, name=self.name, aggregation="mean" if return_mean else "") return out diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py index 0aa65a9f1..7ecd79afd 100644 --- a/transformers/modeling_tf_utils.py +++ b/transformers/modeling_tf_utils.py @@ -15,8 +15,7 @@ # limitations under the License. """TF general model utils.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging import os @@ -26,12 +25,20 @@ from tensorflow.python.keras.saving import hdf5_format import h5py from .configuration_utils import PretrainedConfig -from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS, - cached_path, hf_bucket_url, is_remote_url) +from .file_utils import ( + TF2_WEIGHTS_NAME, + TF_WEIGHTS_NAME, + WEIGHTS_NAME, + DUMMY_INPUTS, + cached_path, + hf_bucket_url, + is_remote_url, +) from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model logger = logging.getLogger(__name__) + class TFPreTrainedModel(tf.keras.Model): r""" Base class for all TF models. 
@@ -60,7 +67,7 @@ class TFPreTrainedModel(tf.keras.Model): Returns: tf.Tensor with dummy inputs """ - return {'input_ids': tf.constant(DUMMY_INPUTS)} + return {"input_ids": tf.constant(DUMMY_INPUTS)} def __init__(self, config, *inputs, **kwargs): super(TFPreTrainedModel, self).__init__(*inputs, **kwargs) @@ -70,7 +77,8 @@ class TFPreTrainedModel(tf.keras.Model): "To create a model from a pretrained model use " "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( self.__class__.__name__, self.__class__.__name__ - )) + ) + ) # Save config in model self.config = config @@ -151,7 +159,9 @@ class TFPreTrainedModel(tf.keras.Model): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method. """ - assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" + assert os.path.isdir( + save_directory + ), "Saving path should be a directory where the model and configuration can be saved" # Save configuration file self.config.save_pretrained(save_directory) @@ -230,20 +240,22 @@ class TFPreTrainedModel(tf.keras.Model): model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_pt=True, config=config) """ - config = kwargs.pop('config', None) - cache_dir = kwargs.pop('cache_dir', None) - from_pt = kwargs.pop('from_pt', False) - force_download = kwargs.pop('force_download', False) - resume_download = kwargs.pop('resume_download', False) - proxies = kwargs.pop('proxies', None) - output_loading_info = kwargs.pop('output_loading_info', False) + config = kwargs.pop("config", None) + cache_dir = kwargs.pop("cache_dir", None) + from_pt = kwargs.pop("from_pt", False) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + output_loading_info = kwargs.pop("output_loading_info", False) # Load config if we don't provide a configuration if not isinstance(config, PretrainedConfig): config_path = config if config is not None else pretrained_model_name_or_path config, model_kwargs = cls.config_class.from_pretrained( - config_path, *model_args, - cache_dir=cache_dir, return_unused_kwargs=True, + config_path, + *model_args, + cache_dir=cache_dir, + return_unused_kwargs=True, force_download=force_download, resume_download=resume_download, **kwargs @@ -263,9 +275,11 @@ class TFPreTrainedModel(tf.keras.Model): # Load from a PyTorch checkpoint archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) else: - raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format( - [WEIGHTS_NAME, TF2_WEIGHTS_NAME], - pretrained_model_name_or_path)) + raise EnvironmentError( + "Error no file named {} found in directory {} or `from_pt` set to False".format( + [WEIGHTS_NAME, TF2_WEIGHTS_NAME], pretrained_model_name_or_path + ) + ) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path elif os.path.isfile(pretrained_model_name_or_path + ".index"): @@ -273,31 +287,37 @@ class TFPreTrainedModel(tf.keras.Model): else: archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=TF2_WEIGHTS_NAME) if from_pt: - raise EnvironmentError("Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name.") + raise EnvironmentError( + "Loading a TF model from a PyTorch 
checkpoint is not supported when using a model identifier name." + ) # redirect to the cache, if necessary try: - resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, - resume_download=resume_download, proxies=proxies) + resolved_archive_file = cached_path( + archive_file, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + ) except EnvironmentError as e: if pretrained_model_name_or_path in cls.pretrained_model_archive_map: - logger.error( - "Couldn't reach server at '{}' to download pretrained weights.".format( - archive_file)) + logger.error("Couldn't reach server at '{}' to download pretrained weights.".format(archive_file)) else: logger.error( "Model name '{}' was not found in model name list ({}). " "We assumed '{}' was a path or url but couldn't find any file " "associated to this path or url.".format( pretrained_model_name_or_path, - ', '.join(cls.pretrained_model_archive_map.keys()), - archive_file)) + ", ".join(cls.pretrained_model_archive_map.keys()), + archive_file, + ) + ) raise e if resolved_archive_file == archive_file: logger.info("loading weights file {}".format(archive_file)) else: - logger.info("loading weights file {} from cache at {}".format( - archive_file, resolved_archive_file)) + logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file)) else: resolved_archive_file = None @@ -316,38 +336,42 @@ class TFPreTrainedModel(tf.keras.Model): try: model.load_weights(resolved_archive_file, by_name=True) except OSError: - raise OSError("Unable to load weights from h5 file. " - "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. ") + raise OSError( + "Unable to load weights from h5 file. " + "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. 
" + ) ret = model(model.dummy_inputs, training=False) # Make sure restore ops are run # Check if the models are the same to output loading informations - with h5py.File(resolved_archive_file, 'r') as f: - if 'layer_names' not in f.attrs and 'model_weights' in f: - f = f['model_weights'] - hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, 'layer_names')) + with h5py.File(resolved_archive_file, "r") as f: + if "layer_names" not in f.attrs and "model_weights" in f: + f = f["model_weights"] + hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, "layer_names")) model_layer_names = set(layer.name for layer in model.layers) missing_keys = list(model_layer_names - hdf5_layer_names) unexpected_keys = list(hdf5_layer_names - model_layer_names) error_msgs = [] if len(missing_keys) > 0: - logger.info("Layers of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, missing_keys)) + logger.info( + "Layers of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys) + ) if len(unexpected_keys) > 0: - logger.info("Layers from pretrained model not used in {}: {}".format( - model.__class__.__name__, unexpected_keys)) + logger.info( + "Layers from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys) + ) if len(error_msgs) > 0: - raise RuntimeError('Error(s) in loading weights for {}:\n\t{}'.format( - model.__class__.__name__, "\n\t".join(error_msgs))) + raise RuntimeError( + "Error(s) in loading weights for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)) + ) if output_loading_info: - loading_info = {"missing_keys": missing_keys, - "unexpected_keys": unexpected_keys, - "error_msgs": error_msgs} + loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs} return model, loading_info return model + class TFConv1D(tf.keras.layers.Layer): def __init__(self, nf, nx, initializer_range=0.02, **kwargs): """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) @@ -360,13 +384,9 @@ class TFConv1D(tf.keras.layers.Layer): def build(self, input_shape): self.weight = self.add_weight( - "weight", - shape=[self.nx, self.nf], - initializer=get_initializer(self.initializer_range)) - self.bias = self.add_weight( - "bias", - shape=[1, self.nf], - initializer=tf.zeros_initializer()) + "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range) + ) + self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer()) def call(self, x): bz, sl = shape_list(x)[:2] @@ -382,11 +402,12 @@ class TFConv1D(tf.keras.layers.Layer): class TFSharedEmbeddings(tf.keras.layers.Layer): """Construct shared token embeddings. 
""" + def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs): super(TFSharedEmbeddings, self).__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size - self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range + self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range def build(self, input_shape): """Build shared word embedding layer @@ -394,9 +415,8 @@ class TFSharedEmbeddings(tf.keras.layers.Layer): https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ self.weight = self.add_weight( - "weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range)) + "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range) + ) super(TFSharedEmbeddings, self).build(input_shape) def call(self, inputs, mode="embedding"): @@ -455,35 +475,36 @@ class TFSequenceSummary(tf.keras.layers.Layer): summary_first_dropout: Add a dropout before the projection and activation summary_last_dropout: Add a dropout after the projection and activation """ + def __init__(self, config, initializer_range=0.02, **kwargs): super(TFSequenceSummary, self).__init__(**kwargs) - self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last' - if self.summary_type == 'attn': + self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last" + if self.summary_type == "attn": # We should use a standard multi-head attention module with absolute positional embedding for that. # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 # We can probably just use the multi-head attention module of PyTorch >=1.1.0 raise NotImplementedError - self.has_summary = hasattr(config, 'summary_use_proj') and config.summary_use_proj + self.has_summary = hasattr(config, "summary_use_proj") and config.summary_use_proj if self.has_summary: - if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: num_classes = config.num_labels else: num_classes = config.hidden_size - self.summary = tf.keras.layers.Dense(num_classes, - kernel_initializer=get_initializer(initializer_range), - name='summary') + self.summary = tf.keras.layers.Dense( + num_classes, kernel_initializer=get_initializer(initializer_range), name="summary" + ) - self.has_activation = hasattr(config, 'summary_activation') and config.summary_activation == 'tanh' + self.has_activation = hasattr(config, "summary_activation") and config.summary_activation == "tanh" if self.has_activation: self.activation = tf.keras.activations.tanh - self.has_first_dropout = hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0 + self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0 if self.has_first_dropout: self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout) - self.has_last_dropout = hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0 + self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0 if self.has_last_dropout: self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout) @@ -502,29 +523,33 @@ class 
TFSequenceSummary(tf.keras.layers.Layer): cls_index = inputs[1] if len(inputs) > 1 else None assert len(inputs) <= 2, "Too many inputs." else: - input_ids = inputs.get('input_ids') - cls_index = inputs.get('cls_index', None) + input_ids = inputs.get("input_ids") + cls_index = inputs.get("cls_index", None) - if self.summary_type == 'last': + if self.summary_type == "last": output = hidden_states[:, -1] - elif self.summary_type == 'first': + elif self.summary_type == "first": output = hidden_states[:, 0] - elif self.summary_type == 'mean': + elif self.summary_type == "mean": output = tf.reduce_mean(hidden_states, axis=1) - elif self.summary_type == 'cls_index': + elif self.summary_type == "cls_index": hidden_shape = shape_list(hidden_states) # e.g. [batch, num choices, seq length, hidden dims] if cls_index is None: - cls_index = tf.fill(hidden_shape[:-2], hidden_shape[-2] - 1) # A tensor full of shape [batch] or [batch, num choices] full of sequence length + cls_index = tf.fill( + hidden_shape[:-2], hidden_shape[-2] - 1 + ) # A tensor full of shape [batch] or [batch, num choices] full of sequence length cls_shape = shape_list(cls_index) if len(cls_shape) <= len(hidden_shape) - 2: cls_index = cls_index[..., tf.newaxis] # else: - # cls_index = cls_index[..., tf.newaxis] - # cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),)) + # cls_index = cls_index[..., tf.newaxis] + # cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),)) # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2) - output = tf.squeeze(output, axis=len(hidden_shape) - 2) # shape of output: (batch, num choices, hidden_size) - elif self.summary_type == 'attn': + output = tf.squeeze( + output, axis=len(hidden_shape) - 2 + ) # shape of output: (batch, num choices, hidden_size) + elif self.summary_type == "attn": raise NotImplementedError if self.has_first_dropout: @@ -541,12 +566,14 @@ class TFSequenceSummary(tf.keras.layers.Layer): return output + def shape_list(x): """Deal with dynamic shape in tensorflow cleanly.""" static = x.shape.as_list() dynamic = tf.shape(x) return [dynamic[i] if s is None else s for i, s in enumerate(static)] + def get_initializer(initializer_range=0.02): """Creates a `tf.initializers.truncated_normal` with the given range. 
Args: diff --git a/transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py index a7cc8ea48..2f443ae2f 100644 --- a/transformers/modeling_tf_xlm.py +++ b/transformers/modeling_tf_xlm.py @@ -25,30 +25,34 @@ import numpy as np import tensorflow as tf from .configuration_xlm import XLMConfig -from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer, DUMMY_INPUTS +from .modeling_tf_utils import ( + TFPreTrainedModel, + TFSharedEmbeddings, + TFSequenceSummary, + shape_list, + get_initializer, + DUMMY_INPUTS, +) from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-tf_model.h5", - 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-tf_model.h5", - 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-tf_model.h5", - 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-tf_model.h5", - 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-tf_model.h5", - 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-tf_model.h5", - 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-tf_model.h5", - 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-tf_model.h5", - 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-tf_model.h5", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-tf_model.h5", + "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-tf_model.h5", + "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-tf_model.h5", + "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-tf_model.h5", + "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-tf_model.h5", + "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-tf_model.h5", + "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-tf_model.h5", + "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-tf_model.h5", + "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-tf_model.h5", + "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-tf_model.h5", + "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-tf_model.h5", } def create_sinusoidal_embeddings(n_pos, dim, out): - position_enc = np.array([ - [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] - for pos in range(n_pos) - ]) + position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out[:, 0::2] = tf.constant(np.sin(position_enc[:, 0::2])) out[:, 1::2] = tf.constant(np.cos(position_enc[:, 1::2])) @@ -78,8 +82,9 @@ def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32): # attention mask is the same as mask, or triangular inferior attention (causal) if causal: - attn_mask = tf.less_equal(tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)), 
- alen[tf.newaxis, :, tf.newaxis]) + attn_mask = tf.less_equal( + tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)), alen[tf.newaxis, :, tf.newaxis] + ) else: attn_mask = mask @@ -106,10 +111,10 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): self.n_heads = n_heads assert self.dim % self.n_heads == 0 - self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='q_lin') - self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='k_lin') - self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='v_lin') - self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='out_lin') + self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin") + self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin") + self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin") + self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin") self.dropout = tf.keras.layers.Dropout(config.attention_dropout) self.pruned_heads = set() @@ -125,7 +130,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): # Mask is (bs, klen) (non-causal) or (bs, klen, klen) bs, qlen, dim = shape_list(input) if kv is None: - klen = qlen if cache is None else cache['slen'] + qlen + klen = qlen if cache is None else cache["slen"] + qlen else: klen = shape_list(kv)[1] # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) @@ -141,40 +146,40 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): """ compute context """ return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) - q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) + q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: - k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) elif cache is None or self.layer_id not in cache: k = v = kv - k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) if cache is not None: if self.layer_id in cache: if kv is None: k_, v_ = cache[self.layer_id] - k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) - v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) + k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) + v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) else: k, v = cache[self.layer_id] cache[self.layer_id] = (k, v) - q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) - scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) - mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) + q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) + scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) + mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) scores 
= scores - 1e30 * (1.0 - mask) - weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) - weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) + weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask - context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) - context = unshape(context) # (bs, qlen, dim) + context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) outputs = (self.out_lin(context),) if self.output_attentions: @@ -183,11 +188,10 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): class TFTransformerFFN(tf.keras.layers.Layer): - def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs): super(TFTransformerFFN, self).__init__(**kwargs) - self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name='lin1') - self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name='lin2') + self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1") + self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2") self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu self.dropout = tf.keras.layers.Dropout(config.dropout) @@ -226,30 +230,36 @@ class TFXLMMainLayer(tf.keras.layers.Layer): # assert len(self.id2lang) == len(self.lang2id) == self.n_langs # model parameters - self.dim = config.emb_dim # 512 by default + self.dim = config.emb_dim # 512 by default self.hidden_dim = self.dim * 4 # 2048 by default - self.n_heads = config.n_heads # 8 by default + self.n_heads = config.n_heads # 8 by default self.n_layers = config.n_layers - assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads' + assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads" # embeddings self.dropout = tf.keras.layers.Dropout(config.dropout) self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout) - self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, - self.dim, - embeddings_initializer=get_initializer(config.embed_init_std), - name='position_embeddings') + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + self.dim, + embeddings_initializer=get_initializer(config.embed_init_std), + name="position_embeddings", + ) if config.sinusoidal_embeddings: raise NotImplementedError # create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight) if config.n_langs > 1 and config.use_lang_emb: - self.lang_embeddings = tf.keras.layers.Embedding(self.n_langs, - self.dim, - embeddings_initializer=get_initializer(config.embed_init_std), - name='lang_embeddings') - self.embeddings = TFSharedEmbeddings(self.n_words, self.dim, initializer_range=config.embed_init_std, name='embeddings') # padding_idx=self.pad_index) - self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm_emb') + self.lang_embeddings = tf.keras.layers.Embedding( + self.n_langs, + self.dim, + embeddings_initializer=get_initializer(config.embed_init_std), + name="lang_embeddings", + ) + self.embeddings = TFSharedEmbeddings( + 
self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings" + ) # padding_idx=self.pad_index) + self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb") # transformer layers self.attentions = [] @@ -261,13 +271,21 @@ class TFXLMMainLayer(tf.keras.layers.Layer): # self.encoder_attn = [] for i in range(self.n_layers): - self.attentions.append(TFMultiHeadAttention(self.n_heads, self.dim, config=config, name='attentions_._{}'.format(i))) - self.layer_norm1.append(tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm1_._{}'.format(i))) + self.attentions.append( + TFMultiHeadAttention(self.n_heads, self.dim, config=config, name="attentions_._{}".format(i)) + ) + self.layer_norm1.append( + tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1_._{}".format(i)) + ) # if self.is_decoder: # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout)) - self.ffns.append(TFTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name='ffns_._{}'.format(i))) - self.layer_norm2.append(tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm2_._{}'.format(i))) + self.ffns.append( + TFTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name="ffns_._{}".format(i)) + ) + self.layer_norm2.append( + tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2_._{}".format(i)) + ) if hasattr(config, "pruned_heads"): pruned_heads = config.pruned_heads.copy().items() @@ -276,7 +294,6 @@ class TFXLMMainLayer(tf.keras.layers.Layer): if self.attentions[int(layer)].n_heads == config.n_heads: self.prune_heads({int(layer): list(map(int, heads))}) - def get_input_embeddings(self): return self.embeddings @@ -290,9 +307,19 @@ class TFXLMMainLayer(tf.keras.layers.Layer): """ raise NotImplementedError - def call(self, inputs, attention_mask=None, langs=None, token_type_ids=None, - position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, - training=False): # removed: src_enc=None, src_len=None + def call( + self, + inputs, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): # removed: src_enc=None, src_len=None if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -305,15 +332,15 @@ class TFXLMMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds assert len(inputs) <= 9, "Too many inputs." 
elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - langs = inputs.get('langs', langs) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - lengths = inputs.get('lengths', lengths) - cache = inputs.get('cache', cache) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + langs = inputs.get("langs", langs) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + lengths = inputs.get("lengths", lengths) + cache = inputs.get("cache", cache) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 9, "Too many inputs." else: input_ids = inputs @@ -331,7 +358,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): if input_ids is not None: lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1) else: - lengths = tf.convert_to_tensor([slen]*bs, tf.int32) + lengths = tf.convert_to_tensor([slen] * bs, tf.int32) # mask = input_ids != self.pad_index # check inputs @@ -375,7 +402,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): # do not recompute cached elements if cache is not None and input_ids is not None: - _slen = slen - cache['slen'] + _slen = slen - cache["slen"] input_ids = input_ids[:, -_slen:] position_ids = position_ids[:, -_slen:] if langs is not None: @@ -430,7 +457,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): # update cache length if cache is not None: - cache['slen'] += tensor.size(1) + cache["slen"] += tensor.size(1) # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) @@ -447,6 +474,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = XLMConfig pretrained_model_archive_map = TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -460,7 +488,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel): langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) else: langs_list = None - return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list} + return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list} XLM_START_DOCSTRING = r""" The XLM model was proposed in @@ -554,8 +582,12 @@ XLM_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. 
""" -@add_start_docstrings("The bare XLM Model transformer outputing raw hidden-states without any specific head on top.", - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare XLM Model transformer outputing raw hidden-states without any specific head on top.", + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class TFXLMModel(TFXLMPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -581,20 +613,21 @@ class TFXLMModel(TFXLMPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFXLMModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXLMMainLayer(config, name='transformer') + self.transformer = TFXLMMainLayer(config, name="transformer") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) return outputs - class TFXLMPredLayer(tf.keras.layers.Layer): """ Prediction layer (cross_entropy or adaptive_softmax). """ + def __init__(self, config, input_embeddings, **kwargs): super(TFXLMPredLayer, self).__init__(**kwargs) self.asm = config.asm @@ -614,10 +647,7 @@ class TFXLMPredLayer(tf.keras.layers.Layer): def build(self, input_shape): # The output weights are the same as the input embeddings, but there is an output-only bias for each token. - self.bias = self.add_weight(shape=(self.n_words,), - initializer='zeros', - trainable=True, - name='bias') + self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias") super(TFXLMPredLayer, self).build(input_shape) def call(self, hidden_states): @@ -626,9 +656,12 @@ class TFXLMPredLayer(tf.keras.layers.Layer): return hidden_states -@add_start_docstrings("""The XLM Model transformer with a language modeling head on top +@add_start_docstrings( + """The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -654,10 +687,11 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFXLMWithLMHeadModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXLMMainLayer(config, name='transformer') - self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name='pred_layer_._proj') + self.transformer = TFXLMMainLayer(config, name="transformer") + self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj") def get_output_embeddings(self): return self.pred_layer.input_embeddings @@ -672,9 +706,12 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): return outputs -@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class TFXLMForSequenceClassification(TFXLMPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -701,12 +738,13 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFXLMForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.transformer = TFXLMMainLayer(config, name='transformer') - self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name='sequence_summary') + self.transformer = TFXLMMainLayer(config, name="transformer") + self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") def call(self, inputs, **kwargs): transformer_outputs = self.transformer(inputs, **kwargs) @@ -718,9 +756,12 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel): return outputs -@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -748,12 +789,13 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): start_scores, end_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXLMMainLayer(config, name='transformer') - self.qa_outputs = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.init_std), - name='qa_outputs') + self.transformer = TFXLMMainLayer(config, name="transformer") + self.qa_outputs = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs" + ) def call(self, inputs, **kwargs): transformer_outputs = self.transformer(inputs, **kwargs) @@ -765,6 +807,8 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) - outputs = (start_logits, end_logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + outputs = (start_logits, end_logits,) + transformer_outputs[ + 1: + ] # Keep mems, hidden states, attentions if there are in it return outputs # start_logits, end_logits, (hidden_states), (attentions) diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py index 2f1fe150c..c1ed720f9 100644 --- a/transformers/modeling_tf_xlnet.py +++ b/transformers/modeling_tf_xlnet.py @@ -35,8 +35,8 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-tf_model.h5", - 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-tf_model.h5", + "xlnet-base-cased": 
"https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-tf_model.h5", + "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-tf_model.h5", } @@ -45,8 +45,7 @@ def gelu(x): XLNet is using OpenAI GPT's gelu Also see https://arxiv.org/abs/1606.08415 """ - cdf = 0.5 * (1.0 + tf.tanh( - (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf @@ -54,9 +53,11 @@ def swish(x): return x * tf.sigmoid(x) -ACT2FN = {"gelu": tf.keras.layers.Activation(gelu), - "relu": tf.keras.activations.relu, - "swish": tf.keras.layers.Activation(swish)} +ACT2FN = { + "gelu": tf.keras.layers.Activation(gelu), + "relu": tf.keras.activations.relu, + "swish": tf.keras.layers.Activation(swish), +} class TFXLNetRelativeAttention(tf.keras.layers.Layer): @@ -67,7 +68,8 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): if config.d_model % config.n_head != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.d_model, config.n_head)) + "heads (%d)" % (config.d_model, config.n_head) + ) self.n_head = config.n_head self.d_head = config.d_head @@ -75,38 +77,38 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): self.scale = 1 / (config.d_head ** 0.5) self.initializer_range = config.initializer_range - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm') + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout) def build(self, input_shape): initializer = get_initializer(self.initializer_range) - self.q = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), - initializer=initializer, - trainable=True, name='q') - self.k = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), - initializer=initializer, - trainable=True, name='k') - self.v = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), - initializer=initializer, - trainable=True, name='v') - self.o = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), - initializer=initializer, - trainable=True, name='o') - self.r = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), - initializer=initializer, - trainable=True, name='r') - self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer='zeros', - trainable=True, name='r_r_bias') - self.r_s_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer='zeros', - trainable=True, name='r_s_bias') - self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer='zeros', - trainable=True, name='r_w_bias') - self.seg_embed = self.add_weight(shape=(2, self.n_head, self.d_head), - initializer=initializer, - trainable=True, name='seg_embed') + self.q = self.add_weight( + shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="q" + ) + self.k = self.add_weight( + shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="k" + ) + self.v = self.add_weight( + shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="v" + ) + self.o = self.add_weight( + shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="o" + ) + self.r = self.add_weight( + shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, 
trainable=True, name="r" + ) + self.r_r_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" + ) + self.r_s_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_s_bias" + ) + self.r_w_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" + ) + self.seg_embed = self.add_weight( + shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed" + ) super(TFXLNetRelativeAttention, self).build(input_shape) def prune_heads(self, heads): @@ -130,18 +132,18 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): q_head, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask, head_mask = inputs # content based attention score - ac = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_w_bias, k_head_h) + ac = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_w_bias, k_head_h) # position based attention score - bd = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_r_bias, k_head_r) + bd = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_r_bias, k_head_r) bd = self.rel_shift(bd, klen=shape_list(ac)[1]) # segment based attention score if seg_mat is None: ef = 0 else: - ef = tf.einsum('ibnd,snd->ibns', q_head + self.r_s_bias, self.seg_embed) - ef = tf.einsum('ijbs,ibns->ijbn', seg_mat, ef) + ef = tf.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed) + ef = tf.einsum("ijbs,ibns->ijbn", seg_mat, ef) # merge attention scores and perform masking attn_score = (ac + bd + ef) * self.scale @@ -162,7 +164,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): attn_prob = attn_prob * head_mask # attention output - attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, v_head_h) + attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, v_head_h) if self.output_attentions: return attn_vec, attn_prob @@ -174,7 +176,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): # post-attention projection (back to `d_model`) h, attn_vec = inputs - attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, self.o) + attn_out = tf.einsum("ibnd,hnd->ibh", attn_vec, self.o) attn_out = self.dropout(attn_out, training=training) @@ -185,8 +187,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): return output def call(self, inputs, training=False): - (h, g, attn_mask_h, attn_mask_g, - r, seg_mat, mems, target_mapping, head_mask) = inputs + (h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems, target_mapping, head_mask) = inputs if g is not None: ###### Two-stream attention with relative positional encoding. 
@@ -197,22 +198,22 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): cat = h # content-based key head - k_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.k) + k_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.k) # content-based value head - v_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.v) + v_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.v) # position-based key head - k_head_r = tf.einsum('ibh,hnd->ibnd', r, self.r) + k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r) ##### h-stream # content-stream query head - q_head_h = tf.einsum('ibh,hnd->ibnd', h, self.q) + q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q) # core attention ops attn_vec_h = self.rel_attn_core( - [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], - training=training) + [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], training=training + ) if self.output_attentions: attn_vec_h, attn_prob_h = attn_vec_h @@ -222,23 +223,23 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): ##### g-stream # query-stream query head - q_head_g = tf.einsum('ibh,hnd->ibnd', g, self.q) + q_head_g = tf.einsum("ibh,hnd->ibnd", g, self.q) # core attention ops if target_mapping is not None: - q_head_g = tf.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping) + q_head_g = tf.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping) attn_vec_g = self.rel_attn_core( - [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], - training=training) + [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], training=training + ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g - attn_vec_g = tf.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping) + attn_vec_g = tf.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping) else: attn_vec_g = self.rel_attn_core( - [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], - training=training) + [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], training=training + ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g @@ -257,17 +258,17 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): cat = h # content heads - q_head_h = tf.einsum('ibh,hnd->ibnd', h, self.q) - k_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.k) - v_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.v) + q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q) + k_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.k) + v_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.v) # positional heads - k_head_r = tf.einsum('ibh,hnd->ibnd', r, self.r) + k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r) # core attention ops attn_vec = self.rel_attn_core( - [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], - training=training) + [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], training=training + ) if self.output_attentions: attn_vec, attn_prob = attn_vec @@ -281,19 +282,21 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): outputs = outputs + (attn_prob,) return outputs + class TFXLNetFeedForward(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFXLNetFeedForward, self).__init__(**kwargs) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm') - self.layer_1 = tf.keras.layers.Dense(config.d_inner, - kernel_initializer=get_initializer(config.initializer_range), - name='layer_1') - self.layer_2 = tf.keras.layers.Dense(config.d_model, - kernel_initializer=get_initializer(config.initializer_range), - 
name='layer_2') + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.layer_1 = tf.keras.layers.Dense( + config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1" + ) + self.layer_2 = tf.keras.layers.Dense( + config.d_model, kernel_initializer=get_initializer(config.initializer_range), name="layer_2" + ) self.dropout = tf.keras.layers.Dropout(config.dropout) - if isinstance(config.ff_activation, str) or \ - (sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)): + if isinstance(config.ff_activation, str) or ( + sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode) + ): self.activation_function = ACT2FN[config.ff_activation] else: self.activation_function = config.ff_activation @@ -308,11 +311,12 @@ class TFXLNetFeedForward(tf.keras.layers.Layer): output = self.layer_norm(output + inp) return output + class TFXLNetLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFXLNetLayer, self).__init__(**kwargs) - self.rel_attn = TFXLNetRelativeAttention(config, name='rel_attn') - self.ff = TFXLNetFeedForward(config, name='ff') + self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn") + self.ff = TFXLNetFeedForward(config, name="ff") self.dropout = tf.keras.layers.Dropout(config.dropout) def call(self, inputs, training=False): @@ -336,10 +340,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer): self.input_embeddings = input_embeddings def build(self, input_shape): - self.bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='bias') + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super(TFXLNetLMHead, self).build(input_shape) def call(self, hidden_states): @@ -366,8 +367,10 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): self.use_bfloat16 = config.use_bfloat16 self.initializer_range = config.initializer_range - self.word_embedding = TFSharedEmbeddings(config.vocab_size, config.d_model, initializer_range=config.initializer_range, name='word_embedding') - self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)] + self.word_embedding = TFSharedEmbeddings( + config.vocab_size, config.d_model, initializer_range=config.initializer_range, name="word_embedding" + ) + self.layer = [TFXLNetLayer(config, name="layer_._{}".format(i)) for i in range(config.n_layer)] self.dropout = tf.keras.layers.Dropout(config.dropout) def get_input_embeddings(self): @@ -375,9 +378,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): def build(self, input_shape): initializer = get_initializer(self.initializer_range) - self.mask_emb = self.add_weight(shape=(1, 1, self.d_model), - initializer=initializer, - trainable=True, name='mask_emb') + self.mask_emb = self.add_weight( + shape=(1, 1, self.d_model), initializer=initializer, trainable=True, name="mask_emb" + ) def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError @@ -417,18 +420,18 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): def cache_mem(self, curr_out, prev_mem): """cache hidden states into memory.""" if self.reuse_len is not None and self.reuse_len > 0: - curr_out = curr_out[:self.reuse_len] + curr_out = curr_out[: self.reuse_len] if prev_mem is None: - new_mem = curr_out[-self.mem_len:] + new_mem = curr_out[-self.mem_len :] else: - new_mem = tf.concat([prev_mem, curr_out], 0)[-self.mem_len:] + new_mem = tf.concat([prev_mem, curr_out], 0)[-self.mem_len :] 
return tf.stop_gradient(new_mem) @staticmethod def positional_embedding(pos_seq, inv_freq, bsz=None): - sinusoid_inp = tf.einsum('i,d->id', pos_seq, inv_freq) + sinusoid_inp = tf.einsum("i,d->id", pos_seq, inv_freq) pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], axis=-1) pos_emb = pos_emb[:, None, :] @@ -444,14 +447,14 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): freq_seq = tf.cast(freq_seq, dtype=dtype) inv_freq = 1 / (10000 ** (freq_seq / self.d_model)) - if self.attn_type == 'bi': + if self.attn_type == "bi": # beg, end = klen - 1, -qlen beg, end = klen, -qlen - elif self.attn_type == 'uni': + elif self.attn_type == "uni": # beg, end = klen - 1, -1 beg, end = klen, -1 else: - raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type)) + raise ValueError("Unknown `attn_type` {}.".format(self.attn_type)) if self.bi_data: fwd_pos_seq = tf.range(beg, end, -1.0) @@ -467,9 +470,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): if bsz is not None: # With bi_data, the batch size should be divisible by 2. - assert bsz%2 == 0 - fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz//2) - bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz//2) + assert bsz % 2 == 0 + fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2) + bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2) else: fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq) bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq) @@ -485,8 +488,19 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): return pos_emb - def call(self, inputs, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, training=False): + def call( + self, + inputs, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -499,15 +513,15 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds assert len(inputs) <= 9, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - mems = inputs.get('mems', mems) - perm_mask = inputs.get('perm_mask', perm_mask) - target_mapping = inputs.get('target_mapping', target_mapping) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - input_mask = inputs.get('input_mask', input_mask) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + mems = inputs.get("mems", mems) + perm_mask = inputs.get("perm_mask", perm_mask) + target_mapping = inputs.get("target_mapping", target_mapping) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + input_mask = inputs.get("input_mask", input_mask) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 9, "Too many inputs." 
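A minimal sketch, assuming toy sizes, of the sinusoidal encoding that `positional_embedding` and `relative_positional_encoding` above compute: each relative position is multiplied by a bank of inverse frequencies, and the sines and cosines are concatenated along the feature axis.

    import numpy as np

    d_model, klen, qlen = 8, 6, 3                           # toy sizes, not taken from any real config
    inv_freq = 1.0 / (10000 ** (np.arange(0, d_model, 2.0) / d_model))
    pos_seq = np.arange(klen, -qlen, -1.0)                  # beg=klen, end=-qlen as in attn_type == "bi"
    sinusoid_inp = np.einsum("i,d->id", pos_seq, inv_freq)  # (klen + qlen, d_model // 2)
    pos_emb = np.concatenate([np.sin(sinusoid_inp), np.cos(sinusoid_inp)], axis=-1)
    assert pos_emb.shape == (klen + qlen, d_model)
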
else: input_ids = inputs @@ -540,17 +554,19 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): ##### Attention mask # causal attention mask - if self.attn_type == 'uni': + if self.attn_type == "uni": attn_mask = self.create_mask(qlen, mlen) attn_mask = attn_mask[:, :, None, None] - elif self.attn_type == 'bi': + elif self.attn_type == "bi": attn_mask = None else: - raise ValueError('Unsupported attention type: {}'.format(self.attn_type)) + raise ValueError("Unsupported attention type: {}".format(self.attn_type)) # data mask: input mask & perm mask - assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) " \ + assert input_mask is None or attention_mask is None, ( + "You can only use one of input_mask (uses 1 for padding) " "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one." + ) if input_mask is None and attention_mask is not None: input_mask = 1.0 - tf.cast(attention_mask, dtype=dtype_float) if input_mask is not None and perm_mask is not None: @@ -564,8 +580,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): if data_mask is not None: # all mems can be attended to - mems_mask = tf.zeros([shape_list(data_mask)[0], mlen, bsz], - dtype=dtype_float) + mems_mask = tf.zeros([shape_list(data_mask)[0], mlen, bsz], dtype=dtype_float) data_mask = tf.concat([mems_mask, data_mask], axis=1) if attn_mask is None: attn_mask = data_mask[:, :, :, None] @@ -590,9 +605,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): output_h = self.dropout(word_emb_k, training=training) if target_mapping is not None: word_emb_q = tf.tile(self.mask_emb, [shape_list(target_mapping)[0], bsz, 1]) - # else: # We removed the inp_q input which was same as target mapping - # inp_q_ext = inp_q[:, :, None] - # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k + # else: # We removed the inp_q input which was same as target mapping + # inp_q_ext = inp_q[:, :, None] + # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k output_g = self.dropout(word_emb_q, training=training) else: output_g = None @@ -604,9 +619,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): cat_ids = tf.concat([mem_pad, token_type_ids], 0) # `1` indicates not in the same segment [qlen x klen x bsz] - seg_mat = tf.cast( - tf.logical_not(tf.equal(token_type_ids[:, None], cat_ids[None, :])), - tf.int32) + seg_mat = tf.cast(tf.logical_not(tf.equal(token_type_ids[:, None], cat_ids[None, :])), tf.int32) seg_mat = tf.one_hot(seg_mat, 2, dtype=dtype_float) else: seg_mat = None @@ -626,7 +639,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.n_layer @@ -643,9 +658,10 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): if self.output_hidden_states: hidden_states.append((output_h, output_g) if output_g is not None else output_h) - outputs = layer_module([output_h, output_g, non_tgt_mask, attn_mask, - pos_emb, seg_mat, mems[i], target_mapping, - head_mask[i]], training=training) + outputs = layer_module( + [output_h, output_g, non_tgt_mask, attn_mask, pos_emb, seg_mat, mems[i], target_mapping, head_mask[i]], + training=training, + ) output_h, 
output_g = outputs[:2] if self.output_attentions: attentions.append(outputs[2]) @@ -679,6 +695,7 @@ class TFXLNetPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = XLNetConfig pretrained_model_archive_map = TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -784,8 +801,12 @@ XLNET_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.", - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.", + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class TFXLNetModel(TFXLNetPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -816,18 +837,22 @@ class TFXLNetModel(TFXLNetPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFXLNetModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXLNetMainLayer(config, name='transformer') + self.transformer = TFXLNetMainLayer(config, name="transformer") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) return outputs -@add_start_docstrings("""XLNet Model with a language modeling head on top +@add_start_docstrings( + """XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings). """, - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -865,10 +890,11 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] """ + def __init__(self, config, *inputs, **kwargs): super(TFXLNetLMHeadModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXLNetMainLayer(config, name='transformer') - self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name='lm_loss') + self.transformer = TFXLNetMainLayer(config, name="transformer") + self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss") def get_output_embeddings(self): return self.lm_loss.input_embeddings @@ -883,9 +909,12 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): return outputs # return logits, (mems), (hidden states), (attentions) -@add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -916,15 +945,18 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFXLNetForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.transformer = TFXLNetMainLayer(config, name='transformer') - self.sequence_summary = TFSequenceSummary(config, initializer_range=config.initializer_range, name='sequence_summary') - self.logits_proj = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='logits_proj') + self.transformer = TFXLNetMainLayer(config, name="transformer") + self.sequence_summary = TFSequenceSummary( + config, initializer_range=config.initializer_range, name="sequence_summary" + ) + self.logits_proj = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" + ) def call(self, inputs, **kwargs): transformer_outputs = self.transformer(inputs, **kwargs) @@ -938,9 +970,12 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): return outputs # return logits, (mems), (hidden states), (attentions) -@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class TFXLNetForTokenClassification(TFXLNetPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -971,14 +1006,15 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel): scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFXLNetForTokenClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.transformer = TFXLNetMainLayer(config, name='transformer') - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.transformer = TFXLNetMainLayer(config, name="transformer") + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): transformer_outputs = self.transformer(inputs, **kwargs) @@ -1027,12 +1063,13 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): start_scores, end_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXLNetMainLayer(config, name='transformer') - self.qa_outputs = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='qa_outputs') + self.transformer = TFXLNetMainLayer(config, name="transformer") + self.qa_outputs = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) def call(self, inputs, **kwargs): 
transformer_outputs = self.transformer(inputs, **kwargs) @@ -1044,10 +1081,13 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) - outputs = (start_logits, end_logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + outputs = (start_logits, end_logits,) + transformer_outputs[ + 1: + ] # Keep mems, hidden states, attentions if there are in it return outputs # start_logits, end_logits, (mems), (hidden_states), (attentions) + # @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of # the hidden-states output to compute `span start logits` and `span end logits`). """, # XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) diff --git a/transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py index 70ef4aea3..cee61ed37 100644 --- a/transformers/modeling_transfo_xl.py +++ b/transformers/modeling_transfo_xl.py @@ -42,65 +42,62 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin", + "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin", } + def build_tf_to_pytorch_map(model, config): """ A map of modules from TF to PyTorch. This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible. """ tf_to_pt_map = {} - if hasattr(model, 'transformer'): + if hasattr(model, "transformer"): # We are loading in a TransfoXLLMHeadModel => we will load also the Adaptive Softmax - tf_to_pt_map.update({ - "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight, - "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias}) - for i, (out_l, proj_l, tie_proj) in enumerate(zip( - model.crit.out_layers, - model.crit.out_projs, - config.tie_projs)): + tf_to_pt_map.update( + { + "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight, + "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias, + } + ) + for i, (out_l, proj_l, tie_proj) in enumerate( + zip(model.crit.out_layers, model.crit.out_projs, config.tie_projs) + ): layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i if config.tie_weight: - tf_to_pt_map.update({ - layer_str + 'b': out_l.bias}) + tf_to_pt_map.update({layer_str + "b": out_l.bias}) else: raise NotImplementedError # I don't think this is implemented in the TF code - tf_to_pt_map.update({ - layer_str + 'lookup_table': out_l.weight, - layer_str + 'b': out_l.bias}) + tf_to_pt_map.update({layer_str + "lookup_table": out_l.weight, layer_str + "b": out_l.bias}) if not tie_proj: - tf_to_pt_map.update({ - layer_str + 'proj': proj_l - }) + tf_to_pt_map.update({layer_str + "proj": proj_l}) # Now load the rest of the transformer model = model.transformer # Embeddings for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)): layer_str = "transformer/adaptive_embed/cutoff_%d/" % i - tf_to_pt_map.update({ - layer_str + 'lookup_table': embed_l.weight, - layer_str + 'proj_W': proj_l - }) + tf_to_pt_map.update({layer_str + "lookup_table": embed_l.weight, layer_str + "proj_W": proj_l}) # Transformer blocks for i, b in enumerate(model.layers): layer_str = 
"transformer/layer_%d/" % i - tf_to_pt_map.update({ - layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight, - layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias, - layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight, - layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight, - layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight, - layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight, - layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias, - layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight, - layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias, - layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight, - layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias, - }) + tf_to_pt_map.update( + { + layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight, + layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias, + layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight, + layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight, + layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight, + layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight, + layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias, + layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight, + layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias, + layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight, + layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias, + } + ) # Relative positioning biases if config.untie_r: @@ -112,11 +109,10 @@ def build_tf_to_pytorch_map(model, config): else: r_r_list = [model.r_r_bias] r_w_list = [model.r_w_bias] - tf_to_pt_map.update({ - 'transformer/r_r_bias': r_r_list, - 'transformer/r_w_bias': r_w_list}) + tf_to_pt_map.update({"transformer/r_r_bias": r_r_list, "transformer/r_w_bias": r_w_list}) return tf_to_pt_map + def load_tf_weights_in_transfo_xl(model, config, tf_path): """ Load tf checkpoints in a pytorch model """ @@ -124,8 +120,10 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path): import numpy as np import tensorflow as tf except ImportError: - logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) raise # Build TF to PyTorch weights loading map tf_to_pt_map = build_tf_to_pytorch_map(model, config) @@ -143,9 +141,9 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path): array = tf_weights[name] # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model - if 'kernel' in name or 'proj' in name: + if "kernel" in name or "proj" in name: array = np.transpose(array) - if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1: + if ("r_r_bias" in name or "r_w_bias" in name) and len(pointer) > 1: # Here we will split the TF weigths assert len(pointer) == array.shape[0] for i, p_i in enumerate(pointer): @@ -166,10 +164,10 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path): logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) tf_weights.pop(name, None) - tf_weights.pop(name + '/Adam', None) - tf_weights.pop(name + '/Adam_1', None) + tf_weights.pop(name + "/Adam", None) + tf_weights.pop(name + "/Adam_1", None) - logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) + logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys()))) return model @@ -180,17 +178,16 @@ class PositionalEmbedding(nn.Module): self.demb = demb inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) - self.register_buffer('inv_freq', inv_freq) + self.register_buffer("inv_freq", inv_freq) def forward(self, pos_seq, bsz=None): sinusoid_inp = torch.ger(pos_seq, self.inv_freq) pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) if bsz is not None: - return pos_emb[:,None,:].expand(-1, bsz, -1) + return pos_emb[:, None, :].expand(-1, bsz, -1) else: - return pos_emb[:,None,:] - + return pos_emb[:, None, :] class PositionwiseFF(nn.Module): @@ -202,7 +199,8 @@ class PositionwiseFF(nn.Module): self.dropout = dropout self.CoreNet = nn.Sequential( - nn.Linear(d_model, d_inner), nn.ReLU(inplace=True), + nn.Linear(d_model, d_inner), + nn.ReLU(inplace=True), nn.Dropout(dropout), nn.Linear(d_inner, d_model), nn.Dropout(dropout), @@ -230,10 +228,22 @@ class PositionwiseFF(nn.Module): class RelPartialLearnableMultiHeadAttn(nn.Module): - def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, - tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, - r_r_bias=None, r_w_bias=None, output_attentions=False, - layer_norm_epsilon=1e-5): + def __init__( + self, + n_head, + d_model, + d_head, + dropout, + dropatt=0, + tgt_len=None, + ext_len=None, + mem_len=None, + pre_lnorm=False, + r_r_bias=None, + r_w_bias=None, + output_attentions=False, + layer_norm_epsilon=1e-5, + ): super(RelPartialLearnableMultiHeadAttn, self).__init__() self.output_attentions = output_attentions @@ -254,7 +264,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): self.pre_lnorm = pre_lnorm - if r_r_bias is None or r_w_bias is None: # Biases are not shared + if r_r_bias is None or r_w_bias is None: # Biases are not shared self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) else: @@ -299,18 +309,18 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): klen = w_head_k.size(0) - w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head - w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head - w_head_v = w_head_v.view(klen, bsz, self.n_head, 
self.d_head) # qlen x bsz x n_head x d_head + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head - r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head + r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head #### compute attention score - rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head - AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head + AC = torch.einsum("ibnd,jbnd->ijbn", (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head rr_head_q = w_head_q + self.r_r_bias - BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head + BD = torch.einsum("ibnd,jnd->ijbn", (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head BD = self._rel_shift(BD) # [qlen x klen x bsz x n_head] @@ -319,21 +329,19 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): #### compute attention probability if attn_mask is not None and torch.sum(attn_mask).item(): - attn_mask = (attn_mask == 1) # Switch to bool + attn_mask = attn_mask == 1 # Switch to bool if attn_mask.dim() == 2: if next(self.parameters()).dtype == torch.float16: - attn_score = attn_score.float().masked_fill( - attn_mask[None,:,:,None], -65000).type_as(attn_score) + attn_score = ( + attn_score.float().masked_fill(attn_mask[None, :, :, None], -65000).type_as(attn_score) + ) else: - attn_score = attn_score.float().masked_fill( - attn_mask[None,:,:,None], -1e30).type_as(attn_score) + attn_score = attn_score.float().masked_fill(attn_mask[None, :, :, None], -1e30).type_as(attn_score) elif attn_mask.dim() == 3: if next(self.parameters()).dtype == torch.float16: - attn_score = attn_score.float().masked_fill( - attn_mask[:,:,:,None], -65000).type_as(attn_score) + attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -65000).type_as(attn_score) else: - attn_score = attn_score.float().masked_fill( - attn_mask[:,:,:,None], -1e30).type_as(attn_score) + attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -1e30).type_as(attn_score) # [qlen x klen x bsz x n_head] attn_prob = F.softmax(attn_score, dim=1) @@ -344,11 +352,10 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): attn_prob = attn_prob * head_mask #### compute attention vector - attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + attn_vec = torch.einsum("ijbn,jbnd->ibnd", (attn_prob, w_head_v)) # [qlen x bsz x n_head x d_head] - attn_vec = attn_vec.contiguous().view( - attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + attn_vec = attn_vec.contiguous().view(attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) ##### linear projection attn_out = self.o_net(attn_vec) @@ -368,21 +375,19 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): class RelPartialLearnableDecoderLayer(nn.Module): - def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, - **kwargs): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs): super(RelPartialLearnableDecoderLayer, self).__init__() - self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model, - d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs) - self.pos_ff = 
PositionwiseFF(d_model, d_inner, dropout, - pre_lnorm=kwargs.get('pre_lnorm'), - layer_norm_epsilon=layer_norm_epsilon) + self.dec_attn = RelPartialLearnableMultiHeadAttn( + n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs + ) + self.pos_ff = PositionwiseFF( + d_model, d_inner, dropout, pre_lnorm=kwargs.get("pre_lnorm"), layer_norm_epsilon=layer_norm_epsilon + ) def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None): - attn_outputs = self.dec_attn(dec_inp, r, - attn_mask=dec_attn_mask, - mems=mems, head_mask=head_mask) + attn_outputs = self.dec_attn(dec_inp, r, attn_mask=dec_attn_mask, mems=mems, head_mask=head_mask) ff_output = self.pos_ff(attn_outputs[0]) outputs = [ff_output] + attn_outputs[1:] @@ -391,8 +396,7 @@ class RelPartialLearnableDecoderLayer(nn.Module): class AdaptiveEmbedding(nn.Module): - def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, - sample_softmax=False): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False): super(AdaptiveEmbedding, self).__init__() self.n_token = n_token @@ -409,28 +413,25 @@ class AdaptiveEmbedding(nn.Module): self.emb_layers = nn.ModuleList() self.emb_projs = nn.ParameterList() if div_val == 1: - self.emb_layers.append( - nn.Embedding(n_token, d_embed, sparse=sample_softmax>0) - ) + self.emb_layers.append(nn.Embedding(n_token, d_embed, sparse=sample_softmax > 0)) if d_proj != d_embed: self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed))) else: for i in range(len(self.cutoffs)): - l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = d_embed // (div_val ** i) - self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i)) + self.emb_layers.append(nn.Embedding(r_idx - l_idx, d_emb_i)) self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))) def forward(self, inp): if self.div_val == 1: embed = self.emb_layers[0](inp) if self.d_proj != self.d_embed: - embed = F.linear(embed, self.emb_projs[0]) + embed = F.linear(embed, self.emb_projs[0]) else: param = next(self.parameters()) inp_flat = inp.view(-1) - emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], - dtype=param.dtype, device=param.device) + emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], dtype=param.dtype, device=param.device) for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] @@ -458,15 +459,16 @@ class TransfoXLPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = TransfoXLConfig pretrained_model_archive_map = TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_transfo_xl base_model_prefix = "transformer" def _init_weight(self, weight): - if self.config.init == 'uniform': + if self.config.init == "uniform": nn.init.uniform_(weight, -self.config.init_range, self.config.init_range) - elif self.config.init == 'normal': + elif self.config.init == "normal": nn.init.normal_(weight, 0.0, self.config.init_std) def _init_bias(self, bias): @@ -476,41 +478,41 @@ class TransfoXLPreTrainedModel(PreTrainedModel): """ Initialize the weights. 
""" classname = m.__class__.__name__ - if classname.find('Linear') != -1: - if hasattr(m, 'weight') and m.weight is not None: + if classname.find("Linear") != -1: + if hasattr(m, "weight") and m.weight is not None: self._init_weight(m.weight) - if hasattr(m, 'bias') and m.bias is not None: + if hasattr(m, "bias") and m.bias is not None: self._init_bias(m.bias) - elif classname.find('AdaptiveEmbedding') != -1: - if hasattr(m, 'emb_projs'): + elif classname.find("AdaptiveEmbedding") != -1: + if hasattr(m, "emb_projs"): for i in range(len(m.emb_projs)): if m.emb_projs[i] is not None: nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std) - elif classname.find('Embedding') != -1: - if hasattr(m, 'weight'): + elif classname.find("Embedding") != -1: + if hasattr(m, "weight"): self._init_weight(m.weight) - elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: - if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + elif classname.find("ProjectedAdaptiveLogSoftmax") != -1: + if hasattr(m, "cluster_weight") and m.cluster_weight is not None: self._init_weight(m.cluster_weight) - if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + if hasattr(m, "cluster_bias") and m.cluster_bias is not None: self._init_bias(m.cluster_bias) - if hasattr(m, 'out_projs'): + if hasattr(m, "out_projs"): for i in range(len(m.out_projs)): if m.out_projs[i] is not None: nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std) - elif classname.find('LayerNorm') != -1: - if hasattr(m, 'weight'): + elif classname.find("LayerNorm") != -1: + if hasattr(m, "weight"): nn.init.normal_(m.weight, 1.0, self.config.init_std) - if hasattr(m, 'bias') and m.bias is not None: + if hasattr(m, "bias") and m.bias is not None: self._init_bias(m.bias) else: - if hasattr(m, 'r_emb'): + if hasattr(m, "r_emb"): self._init_weight(m.r_emb) - if hasattr(m, 'r_w_bias'): + if hasattr(m, "r_w_bias"): self._init_weight(m.r_w_bias) - if hasattr(m, 'r_r_bias'): + if hasattr(m, "r_r_bias"): self._init_weight(m.r_r_bias) - if hasattr(m, 'r_bias'): + if hasattr(m, "r_bias"): self._init_bias(m.r_bias) @@ -559,8 +561,12 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. 
""" -@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", - TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + TRANSFO_XL_START_DOCSTRING, + TRANSFO_XL_INPUTS_DOCSTRING, +) class TransfoXLModel(TransfoXLPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -587,6 +593,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): last_hidden_states, mems = outputs[:2] """ + def __init__(self, config): super(TransfoXLModel, self).__init__(config) self.output_attentions = config.output_attentions @@ -599,8 +606,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel): self.n_head = config.n_head self.d_head = config.d_head - self.word_emb = AdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs, - div_val=config.div_val) + self.word_emb = AdaptiveEmbedding( + config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val + ) self.drop = nn.Dropout(config.dropout) @@ -618,27 +626,35 @@ class TransfoXLModel(TransfoXLPreTrainedModel): self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.layers = nn.ModuleList() - if config.attn_type == 0: # the default attention + if config.attn_type == 0: # the default attention for i in range(config.n_layer): self.layers.append( RelPartialLearnableDecoderLayer( - config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, - tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, - dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, + config.n_head, + config.d_model, + config.d_head, + config.d_inner, + config.dropout, + tgt_len=config.tgt_len, + ext_len=config.ext_len, + mem_len=config.mem_len, + dropatt=config.dropatt, + pre_lnorm=config.pre_lnorm, r_w_bias=None if config.untie_r else self.r_w_bias, r_r_bias=None if config.untie_r else self.r_r_bias, output_attentions=self.output_attentions, - layer_norm_epsilon=config.layer_norm_epsilon) + layer_norm_epsilon=config.layer_norm_epsilon, + ) ) - else: # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints + else: # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints raise NotImplementedError # Removed them to avoid maintaining dead code self.same_length = config.same_length self.clamp_len = config.clamp_len - if self.attn_type == 0: # default attention + if self.attn_type == 0: # default attention self.pos_emb = PositionalEmbedding(self.d_model) - else: # learnable embeddings and absolute embeddings + else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint self.init_weights() @@ -666,8 +682,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): mems = [] param = next(self.parameters()) for i in range(self.n_layer): - empty = torch.zeros(self.mem_len, bsz, self.config.d_model, - dtype=param.dtype, device=param.device) + empty = torch.zeros(self.mem_len, bsz, self.config.d_model, dtype=param.dtype, device=param.device) mems.append(empty) return mems @@ -676,10 +691,11 @@ class TransfoXLModel(TransfoXLPreTrainedModel): def _update_mems(self, hids, mems, qlen, mlen): # does not deal with None - if mems is None: return None + if mems is None: + return None # mems is 
not None - assert len(hids) == len(mems), 'len(hids) != len(mems)' + assert len(hids) == len(mems), "len(hids) != len(mems)" # There are `mlen + qlen` steps that can be cached into mems # For the next step, the last `ext_len` of the `qlen` tokens @@ -725,7 +741,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel): head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.n_layer @@ -743,17 +761,16 @@ class TransfoXLModel(TransfoXLPreTrainedModel): mask_shift_len = qlen - mask_len else: mask_shift_len = qlen - dec_attn_mask = (torch.triu(all_ones, 1+mlen) - + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 + dec_attn_mask = (torch.triu(all_ones, 1 + mlen) + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 else: - dec_attn_mask = torch.triu( - word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None] + dec_attn_mask = torch.triu(word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1 + mlen)[ + :, :, None + ] hids = [] attentions = [] - if self.attn_type == 0: # default - pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, - dtype=word_emb.dtype) + if self.attn_type == 0: # default + pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, dtype=word_emb.dtype) if self.clamp_len > 0: pos_seq.clamp_(max=self.clamp_len) pos_emb = self.pos_emb(pos_seq) @@ -764,12 +781,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel): for i, layer in enumerate(self.layers): hids.append(core_out) mems_i = None if mems is None else mems[i] - layer_outputs = layer(core_out, pos_emb, dec_attn_mask=dec_attn_mask, - mems=mems_i, head_mask=head_mask[i]) + layer_outputs = layer( + core_out, pos_emb, dec_attn_mask=dec_attn_mask, mems=mems_i, head_mask=head_mask[i] + ) core_out = layer_outputs[0] if self.output_attentions: attentions.append(layer_outputs[1]) - else: # learnable embeddings and absolute embeddings + else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint core_out = self.drop(core_out) @@ -791,9 +809,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel): return outputs # last hidden state, new_mems, (all hidden states), (all attentions) -@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top +@add_start_docstrings( + """The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive input embeddings)""", - TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING) + TRANSFO_XL_START_DOCSTRING, + TRANSFO_XL_INPUTS_DOCSTRING, +) class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -830,6 +851,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): prediction_scores, mems = outputs[:2] """ + def __init__(self, config): super(TransfoXLLMHeadModel, self).__init__(config) self.transformer = TransfoXLModel(config) @@ -840,8 +862,9 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): self.sampler = LogUniformSampler(config.vocab_size, config.sample_softmax) # use adaptive softmax (including standard softmax) 
else: - self.crit = ProjectedAdaptiveLogSoftmax(config.vocab_size, config.d_embed, config.d_model, - config.cutoffs, div_val=config.div_val) + self.crit = ProjectedAdaptiveLogSoftmax( + config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val + ) self.init_weights() def tie_weights(self): @@ -856,8 +879,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): else: if self.config.tie_weight: for i in range(len(self.crit.out_layers)): - self._tie_or_clone_weights(self.crit.out_layers[i], - self.transformer.word_emb.emb_layers[i]) + self._tie_or_clone_weights(self.crit.out_layers[i], self.transformer.word_emb.emb_layers[i]) if self.config.tie_projs: for i, tie_proj in enumerate(self.config.tie_projs): if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed: diff --git a/transformers/modeling_transfo_xl_utilities.py b/transformers/modeling_transfo_xl_utilities.py index 0773d0d5f..89451bb55 100644 --- a/transformers/modeling_transfo_xl_utilities.py +++ b/transformers/modeling_transfo_xl_utilities.py @@ -28,9 +28,9 @@ import torch.nn.functional as F # CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) # CUDA_MINOR = int(torch.version.cuda.split('.')[1]) + class ProjectedAdaptiveLogSoftmax(nn.Module): - def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, - keep_order=False): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False): super(ProjectedAdaptiveLogSoftmax, self).__init__() self.n_token = n_token @@ -55,23 +55,19 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): if div_val == 1: for i in range(len(self.cutoffs)): if d_proj != d_embed: - self.out_projs.append( - nn.Parameter(torch.FloatTensor(d_proj, d_embed)) - ) + self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed))) else: self.out_projs.append(None) self.out_layers.append(nn.Linear(d_embed, n_token)) else: for i in range(len(self.cutoffs)): - l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = d_embed // (div_val ** i) - self.out_projs.append( - nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)) - ) + self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))) - self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx)) + self.out_layers.append(nn.Linear(d_emb_i, r_idx - l_idx)) self.keep_order = keep_order @@ -90,7 +86,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): return logit def forward(self, hidden, labels=None, keep_order=False): - ''' + """ Params: hidden :: [len*bsz x d_proj] labels :: [len*bsz] @@ -102,20 +98,17 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): We could replace this implementation by the native PyTorch one if their's had an option to set bias on all clusters in the native one. 
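The docstring above describes the adaptive softmax interface (hidden of shape [len*bsz x d_proj], labels of shape [len*bsz], vocabulary partitioned by `cutoffs`). A minimal sketch of the same idea using PyTorch's built-in adaptive softmax, with made-up sizes, purely for illustration:

    import torch
    import torch.nn as nn

    d_proj, n_token = 16, 1000                   # made-up sizes
    crit = nn.AdaptiveLogSoftmaxWithLoss(d_proj, n_token, cutoffs=[100, 500], div_value=2.0)
    hidden = torch.randn(8, d_proj)              # [len*bsz x d_proj]
    labels = torch.randint(0, n_token, (8,))     # [len*bsz]
    out, loss = crit(hidden, labels)             # out: log-prob of each label, loss: mean NLL
    assert out.shape == (8,)
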
here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138 - ''' + """ if labels is not None: labels = labels.view(-1) if hidden.size(0) != labels.size(0): - raise RuntimeError('Input and labels should have the same size ' - 'in the batch dimension.') + raise RuntimeError("Input and labels should have the same size " "in the batch dimension.") if self.n_clusters == 0: - logit = self._compute_logit(hidden, self.out_layers[0].weight, - self.out_layers[0].bias, self.out_projs[0]) + logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0]) if labels is not None: - out = -F.log_softmax(logit, dim=-1) \ - .gather(1, labels.unsqueeze(1)).squeeze(1) + out = -F.log_softmax(logit, dim=-1).gather(1, labels.unsqueeze(1)).squeeze(1) else: out = F.log_softmax(logit, dim=-1) else: @@ -131,10 +124,8 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): bias_i = self.out_layers[i].bias if i == 0: - weight_i = torch.cat( - [weight_i, self.cluster_weight], dim=0) - bias_i = torch.cat( - [bias_i, self.cluster_bias], dim=0) + weight_i = torch.cat([weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat([bias_i, self.cluster_bias], dim=0) weights.append(weight_i) biases.append(bias_i) @@ -171,7 +162,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): if labels is not None: logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1) else: - out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]] + out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]] else: weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] @@ -179,22 +170,22 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster if labels is not None: - logprob_i = head_logprob_i[:, cluster_prob_idx] \ - + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1) + logprob_i = head_logprob_i[:, cluster_prob_idx] + tail_logprob_i.gather( + 1, target_i[:, None] + ).squeeze(1) else: logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i out[:, l_idx:r_idx] = logprob_i if labels is not None: - if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + if (hasattr(self, "keep_order") and self.keep_order) or keep_order: out.index_copy_(0, indices_i, -logprob_i) else: - out[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + out[offset : offset + logprob_i.size(0)].copy_(-logprob_i) offset += logprob_i.size(0) return out - def log_prob(self, hidden): r""" Computes log probabilities for all :math:`n\_classes` From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py @@ -209,8 +200,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): - Output: :math:`(N, n\_classes)` """ if self.n_clusters == 0: - logit = self._compute_logit(hidden, self.out_layers[0].weight, - self.out_layers[0].bias, self.out_projs[0]) + logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0]) return F.log_softmax(logit, dim=-1) else: # construct weights and biases @@ -225,10 +215,8 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): bias_i = self.out_layers[i].bias if i == 0: - weight_i = torch.cat( - [weight_i, self.cluster_weight], dim=0) - bias_i = torch.cat( - [bias_i, self.cluster_bias], dim=0) + weight_i = torch.cat([weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat([bias_i, self.cluster_bias], dim=0) weights.append(weight_i) 
biases.append(bias_i) @@ -244,7 +232,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1] if i == 0: - out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]] + out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]] else: weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] @@ -270,10 +258,10 @@ class LogUniformSampler(object): """ with torch.no_grad(): self.range_max = range_max - log_indices = torch.arange(1., range_max+2., 1.).log_() + log_indices = torch.arange(1.0, range_max + 2.0, 1.0).log_() self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] - self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float() + self.log_q = (-(-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float() self.n_sample = n_sample @@ -298,6 +286,7 @@ class LogUniformSampler(object): samp_log_probs = self.log_q[neg_samples].to(device) return true_log_probs, samp_log_probs, neg_samples + def sample_logits(embedding, bias, labels, inputs, sampler): """ embedding: an nn.Embedding layer @@ -313,19 +302,17 @@ def sample_logits(embedding, bias, labels, inputs, sampler): b1, b2 = labels.size(0), labels.size(1) all_ids = torch.cat([labels.view(-1), neg_samples]) all_w = embedding(all_ids) - true_w = all_w[: -n_sample].view(b1, b2, -1) - sample_w = all_w[- n_sample:].view(n_sample, -1) + true_w = all_w[:-n_sample].view(b1, b2, -1) + sample_w = all_w[-n_sample:].view(n_sample, -1) all_b = bias[all_ids] - true_b = all_b[: -n_sample].view(b1, b2) - sample_b = all_b[- n_sample:] + true_b = all_b[:-n_sample].view(b1, b2) + sample_b = all_b[-n_sample:] hit = (labels[:, :, None] == neg_samples).detach() - true_logits = torch.einsum('ijk,ijk->ij', - [true_w, inputs]) + true_b - true_log_probs - sample_logits = torch.einsum('lk,ijk->ijl', - [sample_w, inputs]) + sample_b - samp_log_probs + true_logits = torch.einsum("ijk,ijk->ij", [true_w, inputs]) + true_b - true_log_probs + sample_logits = torch.einsum("lk,ijk->ijl", [sample_w, inputs]) + sample_b - samp_log_probs sample_logits.masked_fill_(hit, -1e30) logits = torch.cat([true_logits[:, :, None], sample_logits], -1) diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 05e5ed357..e934b9052 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -15,8 +15,7 @@ # limitations under the License. """PyTorch BERT model.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import copy import json @@ -31,8 +30,15 @@ from torch.nn import CrossEntropyLoss from torch.nn import functional as F from .configuration_utils import PretrainedConfig -from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS, - cached_path, hf_bucket_url, is_remote_url) +from .file_utils import ( + TF2_WEIGHTS_NAME, + TF_WEIGHTS_NAME, + WEIGHTS_NAME, + DUMMY_INPUTS, + cached_path, + hf_bucket_url, + is_remote_url, +) logger = logging.getLogger(__name__) @@ -43,12 +49,14 @@ except ImportError: class Identity(nn.Module): r"""A placeholder identity operator that is argument-insensitive. """ + def __init__(self, *args, **kwargs): super(Identity, self).__init__() def forward(self, input): return input + class PreTrainedModel(nn.Module): r""" Base class for all models. 
@@ -78,7 +86,7 @@ class PreTrainedModel(nn.Module): Returns: torch.Tensor with dummy inputs """ - return {'input_ids': torch.tensor(DUMMY_INPUTS)} + return {"input_ids": torch.tensor(DUMMY_INPUTS)} def __init__(self, config, *inputs, **kwargs): super(PreTrainedModel, self).__init__() @@ -88,7 +96,8 @@ class PreTrainedModel(nn.Module): "To create a model from a pretrained model use " "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( self.__class__.__name__, self.__class__.__name__ - )) + ) + ) # Save config in model self.config = config @@ -136,14 +145,14 @@ class PreTrainedModel(nn.Module): else: output_embeddings.weight = input_embeddings.weight - if hasattr(output_embeddings, 'bias') and output_embeddings.bias is not None: + if hasattr(output_embeddings, "bias") and output_embeddings.bias is not None: output_embeddings.bias.data = torch.nn.functional.pad( output_embeddings.bias.data, (0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0]), - 'constant', - 0 + "constant", + 0, ) - if hasattr(output_embeddings, 'out_features') and hasattr(input_embeddings, 'num_embeddings'): + if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"): output_embeddings.out_features = input_embeddings.num_embeddings def resize_token_embeddings(self, new_num_tokens=None): @@ -244,10 +253,12 @@ class PreTrainedModel(nn.Module): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method. """ - assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" + assert os.path.isdir( + save_directory + ), "Saving path should be a directory where the model and configuration can be saved" # Only save the model itself if we are using distributed training - model_to_save = self.module if hasattr(self, 'module') else self + model_to_save = self.module if hasattr(self, "module") else self # Save configuration file model_to_save.config.save_pretrained(save_directory) @@ -329,21 +340,23 @@ class PreTrainedModel(nn.Module): model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - config = kwargs.pop('config', None) - state_dict = kwargs.pop('state_dict', None) - cache_dir = kwargs.pop('cache_dir', None) - from_tf = kwargs.pop('from_tf', False) - force_download = kwargs.pop('force_download', False) - resume_download = kwargs.pop('resume_download', False) - proxies = kwargs.pop('proxies', None) - output_loading_info = kwargs.pop('output_loading_info', False) + config = kwargs.pop("config", None) + state_dict = kwargs.pop("state_dict", None) + cache_dir = kwargs.pop("cache_dir", None) + from_tf = kwargs.pop("from_tf", False) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + output_loading_info = kwargs.pop("output_loading_info", False) # Load config if we don't provide a configuration if not isinstance(config, PretrainedConfig): config_path = config if config is not None else pretrained_model_name_or_path config, model_kwargs = cls.config_class.from_pretrained( - config_path, *model_args, - cache_dir=cache_dir, return_unused_kwargs=True, + config_path, + *model_args, + cache_dir=cache_dir, + return_unused_kwargs=True, force_download=force_download, resume_download=resume_download, proxies=proxies, @@ -367,43 +380,56 @@ class 
PreTrainedModel(nn.Module): # Load from a PyTorch checkpoint archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) else: - raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format( - [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], - pretrained_model_name_or_path)) + raise EnvironmentError( + "Error no file named {} found in directory {} or `from_tf` set to False".format( + [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], pretrained_model_name_or_path + ) + ) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path elif os.path.isfile(pretrained_model_name_or_path + ".index"): - assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( - pretrained_model_name_or_path + ".index") + assert ( + from_tf + ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( + pretrained_model_name_or_path + ".index" + ) archive_file = pretrained_model_name_or_path + ".index" else: archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME) if from_tf: - raise EnvironmentError("Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name.") + raise EnvironmentError( + "Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name." + ) # redirect to the cache, if necessary try: - resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, - proxies=proxies, resume_download=resume_download) + resolved_archive_file = cached_path( + archive_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + ) except EnvironmentError: if pretrained_model_name_or_path in cls.pretrained_model_archive_map: - msg = "Couldn't reach server at '{}' to download pretrained weights.".format( - archive_file) + msg = "Couldn't reach server at '{}' to download pretrained weights.".format(archive_file) else: - msg = "Model name '{}' was not found in model name list ({}). " \ - "We assumed '{}' was a path or url to model weight files named one of {} but " \ + msg = ( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url to model weight files named one of {} but " "couldn't find any such file at this path or url.".format( pretrained_model_name_or_path, - ', '.join(cls.pretrained_model_archive_map.keys()), + ", ".join(cls.pretrained_model_archive_map.keys()), archive_file, - [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME]) + [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME], + ) + ) raise EnvironmentError(msg) if resolved_archive_file == archive_file: logger.info("loading weights file {}".format(archive_file)) else: - logger.info("loading weights file {} from cache at {}".format( - archive_file, resolved_archive_file)) + logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file)) else: resolved_archive_file = None @@ -412,27 +438,32 @@ class PreTrainedModel(nn.Module): if state_dict is None and not from_tf: try: - state_dict = torch.load(resolved_archive_file, map_location='cpu') + state_dict = torch.load(resolved_archive_file, map_location="cpu") except: - raise OSError("Unable to load weights from pytorch checkpoint file. 
" - "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ") + raise OSError( + "Unable to load weights from pytorch checkpoint file. " + "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. " + ) missing_keys = [] unexpected_keys = [] error_msgs = [] if from_tf: - if resolved_archive_file.endswith('.index'): + if resolved_archive_file.endswith(".index"): # Load from a TensorFlow 1.X checkpoint - provided by original authors model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index' else: # Load from our TensorFlow 2.0 checkpoints try: from transformers import load_tf2_checkpoint_in_pytorch_model + model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True) except ImportError as e: - logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " - "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." + ) raise e else: # Convert old format to new format if needed from a PyTorch state_dict @@ -440,10 +471,10 @@ class PreTrainedModel(nn.Module): new_keys = [] for key in state_dict.keys(): new_key = None - if 'gamma' in key: - new_key = key.replace('gamma', 'weight') - if 'beta' in key: - new_key = key.replace('beta', 'bias') + if "gamma" in key: + new_key = key.replace("gamma", "weight") + if "beta" in key: + new_key = key.replace("beta", "bias") if new_key: old_keys.append(key) new_keys.append(new_key) @@ -451,39 +482,53 @@ class PreTrainedModel(nn.Module): state_dict[new_key] = state_dict.pop(old_key) # copy state_dict so _load_from_state_dict can modify it - metadata = getattr(state_dict, '_metadata', None) + metadata = getattr(state_dict, "_metadata", None) state_dict = state_dict.copy() if metadata is not None: state_dict._metadata = metadata # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants # so we need to apply the function recursively. - def load(module, prefix=''): + def load(module, prefix=""): local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) module._load_from_state_dict( - state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs + ) for name, child in module._modules.items(): if child is not None: - load(child, prefix + name + '.') + load(child, prefix + name + ".") # Make sure we are able to load base models as well as derived models (with heads) - start_prefix = '' + start_prefix = "" model_to_load = model - if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): - start_prefix = cls.base_model_prefix + '.' - if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): + if not hasattr(model, cls.base_model_prefix) and any( + s.startswith(cls.base_model_prefix) for s in state_dict.keys() + ): + start_prefix = cls.base_model_prefix + "." 
+ if hasattr(model, cls.base_model_prefix) and not any( + s.startswith(cls.base_model_prefix) for s in state_dict.keys() + ): model_to_load = getattr(model, cls.base_model_prefix) load(model_to_load, prefix=start_prefix) if len(missing_keys) > 0: - logger.info("Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, missing_keys)) + logger.info( + "Weights of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys + ) + ) if len(unexpected_keys) > 0: - logger.info("Weights from pretrained model not used in {}: {}".format( - model.__class__.__name__, unexpected_keys)) + logger.info( + "Weights from pretrained model not used in {}: {}".format( + model.__class__.__name__, unexpected_keys + ) + ) if len(error_msgs) > 0: - raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( - model.__class__.__name__, "\n\t".join(error_msgs))) + raise RuntimeError( + "Error(s) in loading state_dict for {}:\n\t{}".format( + model.__class__.__name__, "\n\t".join(error_msgs) + ) + ) model.tie_weights() # make sure word embedding weights are still tied if needed @@ -500,10 +545,22 @@ class PreTrainedModel(nn.Module): return {"input_ids": input_ids} @torch.no_grad() - def generate(self, input_ids=None, max_length=None, do_sample=None, num_beams=None, - temperature=None, top_k=None, top_p=None, repetition_penalty=None, - bos_token_id=None, pad_token_id=None, eos_token_ids=None, - length_penalty=None, num_return_sequences=None): + def generate( + self, + input_ids=None, + max_length=None, + do_sample=None, + num_beams=None, + temperature=None, + top_k=None, + top_p=None, + repetition_penalty=None, + bos_token_id=None, + pad_token_id=None, + eos_token_ids=None, + length_penalty=None, + num_return_sequences=None, + ): """ Sequence generator for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling @@ -543,8 +600,10 @@ class PreTrainedModel(nn.Module): # We cannot generate if the model does not have a LM head if self.get_output_embeddings() is None: - raise AttributeError("You tried to generate sequences with a model that does not have a LM Head." - "Please use another model class (e.g. `OpenAIGPTLMHeadModel`)") + raise AttributeError( + "You tried to generate sequences with a model that does not have a LM Head." + "Please use another model class (e.g. `OpenAIGPTLMHeadModel`)" + ) max_length = max_length if max_length is not None else self.config.max_length do_sample = do_sample if do_sample is not None else self.config.do_sample @@ -557,7 +616,9 @@ class PreTrainedModel(nn.Module): pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id eos_token_ids = eos_token_ids if eos_token_ids is not None else self.config.eos_token_ids length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty - num_return_sequences = num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + ) if input_ids is not None: batch_size = input_ids.shape[0] # overriden by the input batch_size @@ -575,13 +636,18 @@ class PreTrainedModel(nn.Module): assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." assert isinstance(bos_token_id, int) and bos_token_id >= 0, "`bos_token_id` should be a positive integer." 
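For orientation on the reformatted generate() signature and the config fallbacks above, a minimal sketch of a sampling call; the checkpoint name, prompt, and explicitly passed token ids are illustrative only, and depending on the checkpoint's config some of these ids may instead come from the defaults resolved above.

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

input_ids = torch.tensor([tokenizer.encode("Reformatting a large code base")])
output = model.generate(
    input_ids=input_ids,
    max_length=30,
    do_sample=True,                          # sample instead of greedy decoding
    temperature=0.9,
    top_k=50,                                # keep the 50 best tokens before sampling
    top_p=0.95,                              # nucleus filtering on top of top-k
    repetition_penalty=1.2,                  # CTRL-style penalty on repeated tokens
    bos_token_id=tokenizer.bos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_ids=[tokenizer.eos_token_id],
    num_return_sequences=1,
)
print(tokenizer.decode(output[0].tolist()))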
assert isinstance(pad_token_id, int) and pad_token_id >= 0, "`pad_token_id` should be a positive integer." - assert isinstance(eos_token_ids, (list, tuple)) and (e >= 0 for e in eos_token_ids), \ - "`eos_token_ids` should be a positive integer or a list/tuple of positive integers." + assert isinstance(eos_token_ids, (list, tuple)) and ( + e >= 0 for e in eos_token_ids + ), "`eos_token_ids` should be a positive integer or a list/tuple of positive integers." assert length_penalty > 0, "`length_penalty` should be strictely positive." - assert isinstance(num_return_sequences, int) and num_return_sequences > 0, "`num_return_sequences` should be a strictely positive integer." + assert ( + isinstance(num_return_sequences, int) and num_return_sequences > 0 + ), "`num_return_sequences` should be a strictely positive integer." if input_ids is None: - input_ids = torch.full((batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device) + input_ids = torch.full( + (batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device + ) else: assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)." @@ -592,28 +658,63 @@ class PreTrainedModel(nn.Module): if num_return_sequences != 1: # Expand input to num return sequences input_ids = input_ids.unsqueeze(1).expand(batch_size, num_return_sequences, cur_len) - input_ids = input_ids.contiguous().view(batch_size * num_return_sequences, cur_len) # (batch_size * num_return_sequences, cur_len) + input_ids = input_ids.contiguous().view( + batch_size * num_return_sequences, cur_len + ) # (batch_size * num_return_sequences, cur_len) effective_batch_size = batch_size * num_return_sequences else: effective_batch_size = batch_size if num_beams > 1: - output = self._generate_beam_search(input_ids, cur_len, max_length, do_sample, - temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, effective_batch_size, - length_penalty, num_beams, vocab_size) + output = self._generate_beam_search( + input_ids, + cur_len, + max_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + pad_token_id, + eos_token_ids, + effective_batch_size, + length_penalty, + num_beams, + vocab_size, + ) else: - output = self._generate_no_beam_search(input_ids, cur_len, max_length, do_sample, - temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, effective_batch_size) + output = self._generate_no_beam_search( + input_ids, + cur_len, + max_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + pad_token_id, + eos_token_ids, + effective_batch_size, + ) if num_return_sequences != 1: output = output.view(batch_size, num_return_sequences, -1) return output - def _generate_no_beam_search(self, input_ids, cur_len, max_length, do_sample, - temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, batch_size): + def _generate_no_beam_search( + self, + input_ids, + cur_len, + max_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + pad_token_id, + eos_token_ids, + batch_size, + ): """ Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated independantly. 
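The body of _generate_no_beam_search is not shown in these hunks; as a rough, self-contained stand-in for what that loop does (one token per step, greedy argmax or temperature sampling), assuming a model whose first output is next-token logits. The helper name below is made up for illustration.

import torch
import torch.nn.functional as F

def sample_one_token_at_a_time(model, input_ids, max_length, do_sample=False, temperature=1.0):
    # Condensed sketch only: repetition penalty, top-k/top-p filtering and
    # EOS/padding handling from the real method are omitted here.
    cur_len = input_ids.shape[1]
    while cur_len < max_length:
        next_token_logits = model(input_ids)[0][:, -1, :]
        if do_sample:
            if temperature != 1.0:
                next_token_logits = next_token_logits / temperature
            probs = F.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
        else:
            next_token = torch.argmax(next_token_logits, dim=-1)
        input_ids = torch.cat([input_ids, next_token.unsqueeze(-1)], dim=-1)
        cur_len += 1
    return input_ids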
""" @@ -663,23 +764,38 @@ class PreTrainedModel(nn.Module): return input_ids - def _generate_beam_search(self, input_ids, cur_len, max_length, do_sample, - temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, batch_size, - length_penalty, num_beams, vocab_size): + def _generate_beam_search( + self, + input_ids, + cur_len, + max_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + pad_token_id, + eos_token_ids, + batch_size, + length_penalty, + num_beams, + vocab_size, + ): """ Generate sequences for each example with beam search. """ # Expand input to num beams input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len) - input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len) # (batch_size * num_beams, cur_len) + input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len) # (batch_size * num_beams, cur_len) # generated hypotheses - generated_hyps = [BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=False) for _ in range(batch_size)] + generated_hyps = [ + BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=False) for _ in range(batch_size) + ] # scores for each sentence in the beam beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) + beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) # cache compute states pasts = None # self.prepare_pasts() @@ -689,8 +805,8 @@ class PreTrainedModel(nn.Module): while cur_len < max_length: model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts) - scores = self(**model_inputs)[0] # (batch_size * num_beams, cur_len, vocab_size) - scores = scores[:, -1, :] # (batch_size * num_beams, vocab_size) + scores = self(**model_inputs)[0] # (batch_size * num_beams, cur_len, vocab_size) + scores = scores[:, -1, :] # (batch_size * num_beams, vocab_size) # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: @@ -703,25 +819,27 @@ class PreTrainedModel(nn.Module): if temperature > 0 and temperature != 1.0: scores = scores / temperature # Top-p/top-k filtering - scores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2) # (batch_size * num_beams, vocab_size) + scores = top_k_top_p_filtering( + scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 + ) # (batch_size * num_beams, vocab_size) # Sample 2 next words for each beam (so we have some spare tokens and match output of greedy beam search) - next_words = torch.multinomial(F.softmax(scores, dim=-1), num_samples=2) # (batch_size * num_beams, 2) + next_words = torch.multinomial(F.softmax(scores, dim=-1), num_samples=2) # (batch_size * num_beams, 2) # Compute next scores - _scores = F.log_softmax(scores, dim=-1) # (batch_size * num_beams, vocab_size) - _scores = torch.gather(_scores, -1, next_words) # (batch_size * num_beams, 2) - next_scores = _scores + beam_scores[:, None].expand_as(_scores) # (batch_size * num_beams, 2) + _scores = F.log_softmax(scores, dim=-1) # (batch_size * num_beams, vocab_size) + _scores = torch.gather(_scores, -1, next_words) # (batch_size * num_beams, 2) + next_scores = _scores + beam_scores[:, None].expand_as(_scores) # (batch_size * num_beams, 2) # Match shape of greedy beam search - next_words = next_words.view(batch_size, 2 * num_beams) # (batch_size, 2 * num_beams) - next_scores = next_scores.view(batch_size, 2 * 
num_beams) # (batch_size, 2 * num_beams) + next_words = next_words.view(batch_size, 2 * num_beams) # (batch_size, 2 * num_beams) + next_scores = next_scores.view(batch_size, 2 * num_beams) # (batch_size, 2 * num_beams) else: # do greedy beam search - scores = F.log_softmax(scores, dim=-1) # (batch_size * num_beams, vocab_size) + scores = F.log_softmax(scores, dim=-1) # (batch_size * num_beams, vocab_size) assert scores.size() == (batch_size * num_beams, vocab_size) # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product) - _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) + _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) # re-organize to group the beam together (we are keeping top hypothesis accross beams) - _scores = _scores.view(batch_size, num_beams * vocab_size) # (batch_size, num_beams * vocab_size) - next_scores, next_words = torch.topk(_scores, 2*num_beams, dim=1, largest=True, sorted=True) + _scores = _scores.view(batch_size, num_beams * vocab_size) # (batch_size, num_beams * vocab_size) + next_scores, next_words = torch.topk(_scores, 2 * num_beams, dim=1, largest=True, sorted=True) assert next_scores.size() == next_words.size() == (batch_size, 2 * num_beams) @@ -750,7 +868,9 @@ class PreTrainedModel(nn.Module): # end of sentence, or next word if word_id.item() in eos_token_ids or cur_len + 1 == max_length: - generated_hyps[batch_ex].add(input_ids[batch_ex * num_beams + beam_id, :cur_len].clone(), score.item()) + generated_hyps[batch_ex].add( + input_ids[batch_ex * num_beams + beam_id, :cur_len].clone(), score.item() + ) else: next_sent_beam.append((score, word_id, batch_ex * num_beams + beam_id)) @@ -807,13 +927,13 @@ class PreTrainedModel(nn.Module): # generate target batch decoded = input_ids.new(batch_size, tgt_len.max().item()).fill_(pad_token_id) for i, hypo in enumerate(best): - decoded[i, :tgt_len[i] - 1] = hypo + decoded[i, : tgt_len[i] - 1] = hypo decoded[i, tgt_len[i] - 1] = eos_token_ids[0] return decoded -def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float('Inf'), min_tokens_to_keep=1): +def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1): """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering Args: logits: logits distribution shape (batch size, vocabulary size) @@ -849,7 +969,6 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float('Inf') class BeamHypotheses(object): - def __init__(self, n_hyp, max_length, length_penalty, early_stopping): """ Initialize n-best list of hypotheses. @@ -915,6 +1034,7 @@ class Conv1D(nn.Module): class PoolerStartLogits(nn.Module): """ Compute SQuAD start_logits from sequence hidden states. """ + def __init__(self, config): super(PoolerStartLogits, self).__init__() self.dense = nn.Linear(config.hidden_size, 1) @@ -939,6 +1059,7 @@ class PoolerStartLogits(nn.Module): class PoolerEndLogits(nn.Module): """ Compute SQuAD end_logits from sequence hidden states and start token hidden state. """ + def __init__(self, config): super(PoolerEndLogits, self).__init__() self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) @@ -959,12 +1080,14 @@ class PoolerEndLogits(nn.Module): Mask of invalid position such as query and special symbols (PAD, SEP, CLS) 1.0 means token should be masked. 
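Looking back at the top_k_top_p_filtering helper whose signature and docstring appear earlier in this hunk: its body is mostly untouched by the reformat, so a condensed, runnable sketch of the standard top-k / nucleus logic may help; the function name below is a stand-in, and min_tokens_to_keep, which the real helper supports, is left out.

import torch
import torch.nn.functional as F

def top_k_top_p_sketch(logits, top_k=0, top_p=1.0, filter_value=-float("inf")):
    if top_k > 0:
        # Remove every token scoring below the k-th best logit.
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits = logits.masked_fill(logits < kth_best, filter_value)
    if top_p < 1.0:
        # Keep the smallest set of tokens whose cumulative probability exceeds top_p.
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_mask = cumulative_probs > top_p
        sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()  # keep the first token past the threshold
        sorted_mask[..., 0] = False
        mask = sorted_mask.scatter(-1, sorted_indices, sorted_mask)
        logits = logits.masked_fill(mask, filter_value)
    return logits

print(top_k_top_p_sketch(torch.tensor([[2.0, 1.0, 0.5, 0.1]]), top_k=2))
# tensor([[2., 1., -inf, -inf]])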
""" - assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None" + assert ( + start_states is not None or start_positions is not None + ), "One of start_states, start_positions should be not None" if start_positions is not None: slen, hsz = hidden_states.shape[-2:] - start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) - start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) - start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) + start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) + start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) x = self.activation(x) @@ -982,6 +1105,7 @@ class PoolerEndLogits(nn.Module): class PoolerAnswerClass(nn.Module): """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """ + def __init__(self, config): super(PoolerAnswerClass, self).__init__() self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) @@ -1006,16 +1130,18 @@ class PoolerAnswerClass(nn.Module): for each sample """ hsz = hidden_states.shape[-1] - assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None" + assert ( + start_states is not None or start_positions is not None + ), "One of start_states, start_positions should be not None" if start_positions is not None: - start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) - start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) + start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) if cls_index is not None: - cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) - cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) + cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) else: - cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) + cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) x = self.activation(x) @@ -1064,6 +1190,7 @@ class SQuADHead(nn.Module): ``torch.FloatTensor`` of shape ``(batch_size,)`` Log probabilities for the ``is_impossible`` label of the answers. 
""" + def __init__(self, config): super(SQuADHead, self).__init__() self.start_n_top = config.start_n_top @@ -1073,8 +1200,9 @@ class SQuADHead(nn.Module): self.end_logits = PoolerEndLogits(config) self.answer_class = PoolerAnswerClass(config) - def forward(self, hidden_states, start_positions=None, end_positions=None, - cls_index=None, is_impossible=None, p_mask=None): + def forward( + self, hidden_states, start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None + ): outputs = () start_logits = self.start_logits(hidden_states, p_mask=p_mask) @@ -1107,19 +1235,25 @@ class SQuADHead(nn.Module): else: # during inference, compute the end logits based on beam search bsz, slen, hsz = hidden_states.size() - start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) - - start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top) - start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) - start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) - start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) - - hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz) + start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) + + start_top_log_probs, start_top_index = torch.topk( + start_log_probs, self.start_n_top, dim=-1 + ) # shape (bsz, start_n_top) + start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) + start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) + start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) + + hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( + start_states + ) # shape (bsz, slen, start_n_top, hsz) p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) - end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) + end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) - end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top) + end_top_log_probs, end_top_index = torch.topk( + end_log_probs, self.end_n_top, dim=1 + ) # shape (bsz, end_n_top, start_n_top) end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) @@ -1148,34 +1282,35 @@ class SequenceSummary(nn.Module): summary_first_dropout: Add a dropout before the projection and activation summary_last_dropout: Add a dropout after the projection and activation """ + def __init__(self, config): super(SequenceSummary, self).__init__() - self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last' - if self.summary_type == 'attn': + self.summary_type = config.summary_type if hasattr(config, "summary_type") else "last" + if self.summary_type == "attn": # We should use a standard multi-head attention module with absolute positional embedding for that. # Cf. 
https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 # We can probably just use the multi-head attention module of PyTorch >=1.1.0 raise NotImplementedError self.summary = Identity() - if hasattr(config, 'summary_use_proj') and config.summary_use_proj: - if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0: + if hasattr(config, "summary_use_proj") and config.summary_use_proj: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: num_classes = config.num_labels else: num_classes = config.hidden_size self.summary = nn.Linear(config.hidden_size, num_classes) self.activation = Identity() - if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh': + if hasattr(config, "summary_activation") and config.summary_activation == "tanh": self.activation = nn.Tanh() self.first_dropout = Identity() - if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0: + if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: self.first_dropout = nn.Dropout(config.summary_first_dropout) self.last_dropout = Identity() - if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0: + if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: self.last_dropout = nn.Dropout(config.summary_last_dropout) def forward(self, hidden_states, cls_index=None): @@ -1185,21 +1320,21 @@ class SequenceSummary(nn.Module): if summary_type == 'cls_index' and cls_index is None: we take the last token of the sequence as classification token """ - if self.summary_type == 'last': + if self.summary_type == "last": output = hidden_states[:, -1] - elif self.summary_type == 'first': + elif self.summary_type == "first": output = hidden_states[:, 0] - elif self.summary_type == 'mean': + elif self.summary_type == "mean": output = hidden_states.mean(dim=1) - elif self.summary_type == 'cls_index': + elif self.summary_type == "cls_index": if cls_index is None: - cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long) + cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2] - 1, dtype=torch.long) else: cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) - cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),)) + cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states - output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) - elif self.summary_type == 'attn': + output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) + elif self.summary_type == "attn": raise NotImplementedError output = self.first_dropout(output) diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py index 5135f1e88..cd758a043 100644 --- a/transformers/modeling_xlm.py +++ b/transformers/modeling_xlm.py @@ -34,24 +34,21 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) XLM_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin", - 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-pytorch_model.bin", - 'xlm-mlm-enfr-1024': 
"https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-pytorch_model.bin", - 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-pytorch_model.bin", - 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-pytorch_model.bin", - 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin", - 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin", - 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin", - 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin", + "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin", + "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-pytorch_model.bin", + "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-pytorch_model.bin", + "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-pytorch_model.bin", + "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-pytorch_model.bin", + "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin", + "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin", + "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin", + "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin", + "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin", } def create_sinusoidal_embeddings(n_pos, dim, out): - position_enc = np.array([ - [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] - for pos in range(n_pos) - ]) + position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() @@ -142,7 +139,7 @@ class MultiHeadAttention(nn.Module): # Mask is (bs, klen) (non-causal) or (bs, klen, klen) bs, qlen, dim = input.size() if kv is None: - klen = qlen if cache is None else cache['slen'] + qlen + klen = qlen if cache is None else cache["slen"] + qlen else: klen = kv.size(1) # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) @@ -158,39 +155,39 @@ class MultiHeadAttention(nn.Module): """ compute context """ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) - q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) + q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: - k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) elif cache is None or self.layer_id not in cache: k = v = kv - k = shape(self.k_lin(k)) # (bs, 
n_heads, qlen, dim_per_head) - v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) if cache is not None: if self.layer_id in cache: if kv is None: k_, v_ = cache[self.layer_id] - k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) - v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) + k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) + v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) else: k, v = cache[self.layer_id] cache[self.layer_id] = (k, v) - q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) - scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen) - mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen) - scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) + q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) + scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen) + mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen) + scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, qlen, klen) - weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) + weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask - context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) - context = unshape(context) # (bs, qlen, dim) + context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) outputs = (self.out_lin(context),) if self.output_attentions: @@ -199,7 +196,6 @@ class MultiHeadAttention(nn.Module): class TransformerFFN(nn.Module): - def __init__(self, in_dim, dim_hidden, out_dim, config): super(TransformerFFN, self).__init__() self.dropout = config.dropout @@ -219,6 +215,7 @@ class XLMPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = XLMConfig pretrained_model_archive_map = XLM_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = None @@ -235,7 +232,7 @@ class XLMPreTrainedModel(PreTrainedModel): langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) else: langs_list = None - return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list} + return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list} def _init_weights(self, module): """ Initialize the weights. """ @@ -245,8 +242,8 @@ class XLMPreTrainedModel(PreTrainedModel): if isinstance(module, nn.Linear): if self.config is not None and self.config.init_std is not None: nn.init.normal_(module.weight, mean=0, std=self.config.init_std) - if hasattr(module, 'bias') and module.bias is not None: - nn.init.constant_(module.bias, 0.) + if hasattr(module, "bias") and module.bias is not None: + nn.init.constant_(module.bias, 0.0) if isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) @@ -327,8 +324,12 @@ XLM_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. 
""" -@add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.", - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare XLM Model transformer outputting raw hidden-states without any specific head on top.", + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class XLMModel(XLMPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -351,7 +352,8 @@ class XLMModel(XLMPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ - def __init__(self, config): #, dico, is_encoder, with_output): + + def __init__(self, config): # , dico, is_encoder, with_output): super(XLMModel, self).__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states @@ -377,13 +379,13 @@ class XLMModel(XLMPreTrainedModel): # assert len(self.id2lang) == len(self.lang2id) == self.n_langs # model parameters - self.dim = config.emb_dim # 512 by default + self.dim = config.emb_dim # 512 by default self.hidden_dim = self.dim * 4 # 2048 by default - self.n_heads = config.n_heads # 8 by default + self.n_heads = config.n_heads # 8 by default self.n_layers = config.n_layers self.dropout = config.dropout self.attention_dropout = config.attention_dropout - assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads' + assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads" # embeddings self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim) @@ -435,8 +437,18 @@ class XLMModel(XLMPreTrainedModel): for layer, heads in heads_to_prune.items(): self.attentions[layer].prune_heads(heads) - def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, - lengths=None, cache=None, head_mask=None, inputs_embeds=None): # removed: src_enc=None, src_len=None + def forward( + self, + input_ids=None, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + ): # removed: src_enc=None, src_len=None if input_ids is not None: bs, slen = input_ids.size() else: @@ -446,7 +458,7 @@ class XLMModel(XLMPreTrainedModel): if input_ids is not None: lengths = (input_ids != self.pad_index).sum(dim=1).long() else: - lengths = torch.LongTensor([slen]*bs) + lengths = torch.LongTensor([slen] * bs) # mask = input_ids != self.pad_index # check inputs @@ -488,14 +500,18 @@ class XLMModel(XLMPreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.n_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.n_layers # do not recompute cached elements if cache is not None and input_ids is not None: - _slen = slen - cache['slen'] + _slen = slen - cache["slen"] input_ids = input_ids[:, -_slen:] position_ids = position_ids[:, -_slen:] if langs is not None: 
@@ -550,7 +566,7 @@ class XLMModel(XLMPreTrainedModel): # update cache length if cache is not None: - cache['slen'] += tensor.size(1) + cache["slen"] += tensor.size(1) # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) @@ -567,6 +583,7 @@ class XLMPredLayer(nn.Module): """ Prediction layer (cross_entropy or adaptive_softmax). """ + def __init__(self, config): super(XLMPredLayer, self).__init__() self.asm = config.asm @@ -593,7 +610,7 @@ class XLMPredLayer(nn.Module): scores = self.proj(x) outputs = (scores,) + outputs if y is not None: - loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction='elementwise_mean') + loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="elementwise_mean") outputs = (loss,) + outputs else: scores = self.proj.log_prob(x) @@ -605,9 +622,12 @@ class XLMPredLayer(nn.Module): return outputs -@add_start_docstrings("""The XLM Model transformer with a language modeling head on top +@add_start_docstrings( + """The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class XLMWithLMHeadModel(XLMPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -639,6 +659,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(XLMWithLMHeadModel, self).__init__(config) self.transformer = XLMModel(config) @@ -661,17 +682,30 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): langs = None return {"input_ids": input_ids, "langs": langs} - def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, - lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - langs=langs, - token_type_ids=token_type_ids, - position_ids=position_ids, - lengths=lengths, - cache=cache, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) output = transformer_outputs[0] outputs = self.pred_layer(output, labels) @@ -680,9 +714,12 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): return outputs -@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
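Since the XLMWithLMHeadModel.forward call above is only re-wrapped into one keyword argument per line, behaviour is unchanged; a minimal usage sketch, with the checkpoint name taken from the XLM archive map earlier in this file and an illustrative input sentence:

import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048")

input_ids = torch.tensor([tokenizer.encode("Black only changes formatting")])
outputs = model(input_ids)
prediction_scores = outputs[0]   # (batch_size, sequence_length, vocab_size)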
""", - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class XLMForSequenceClassification(XLMPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -714,6 +751,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(XLMForSequenceClassification, self).__init__(config) self.num_labels = config.num_labels @@ -723,17 +761,30 @@ class XLMForSequenceClassification(XLMPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, - lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - langs=langs, - token_type_ids=token_type_ids, - position_ids=position_ids, - lengths=lengths, - cache=cache, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) output = transformer_outputs[0] logits = self.sequence_summary(output) @@ -753,9 +804,12 @@ class XLMForSequenceClassification(XLMPreTrainedModel): return outputs -@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -799,6 +853,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): loss, start_scores, end_scores = outputs[:2] """ + def __init__(self, config): super(XLMForQuestionAnsweringSimple, self).__init__(config) @@ -807,17 +862,31 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, - lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - langs=langs, - token_type_ids=token_type_ids, - position_ids=position_ids, - lengths=lengths, - cache=cache, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = transformer_outputs[0] @@ -826,7 +895,10 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) - outputs = (start_logits, end_logits,) + outputs = ( + start_logits, + end_logits, + ) if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: @@ -849,9 +921,12 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): return outputs -@add_start_docstrings("""XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class XLMForQuestionAnswering(XLMPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -895,6 +970,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): loss, start_scores, end_scores = outputs[:2] """ + def __init__(self, config): super(XLMForQuestionAnswering, self).__init__(config) @@ -903,23 +979,45 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, - lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, - is_impossible=None, cls_index=None, p_mask=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - langs=langs, - token_type_ids=token_type_ids, - position_ids=position_ids, - lengths=lengths, - cache=cache, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + is_impossible=None, + cls_index=None, + p_mask=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) output = transformer_outputs[0] - outputs = self.qa_outputs(output, start_positions=start_positions, end_positions=end_positions, - cls_index=cls_index, is_impossible=is_impossible, p_mask=p_mask) + outputs = self.qa_outputs( + output, + start_positions=start_positions, + end_positions=end_positions, + cls_index=cls_index, + is_impossible=is_impossible, + p_mask=p_mask, + ) outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py index 0bdce941a..8f1ed6ec6 100644 --- a/transformers/modeling_xlm_roberta.py +++ b/transformers/modeling_xlm_roberta.py @@ -15,24 +15,29 @@ # limitations under the License. """PyTorch XLM-RoBERTa model. 
""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging -from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification +from .modeling_roberta import ( + RobertaModel, + RobertaForMaskedLM, + RobertaForSequenceClassification, + RobertaForMultipleChoice, + RobertaForTokenClassification, +) from .configuration_xlm_roberta import XLMRobertaConfig from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-pytorch_model.bin", - 'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-pytorch_model.bin", - 'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-pytorch_model.bin", - 'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-pytorch_model.bin", - 'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-pytorch_model.bin", - 'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-pytorch_model.bin", + "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-pytorch_model.bin", + "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-pytorch_model.bin", + "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-pytorch_model.bin", + "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-pytorch_model.bin", + "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-pytorch_model.bin", + "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-pytorch_model.bin", } @@ -105,8 +110,12 @@ XLM_ROBERTA_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", - XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", + XLM_ROBERTA_START_DOCSTRING, + XLM_ROBERTA_INPUTS_DOCSTRING, +) class XLMRobertaModel(RobertaModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -154,8 +163,11 @@ class XLMRobertaModel(RobertaModel): pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""XLM-RoBERTa Model with a `language modeling` head on top. """, - XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) +@add_start_docstrings( + """XLM-RoBERTa Model with a `language modeling` head on top. 
""", + XLM_ROBERTA_START_DOCSTRING, + XLM_ROBERTA_INPUTS_DOCSTRING, +) class XLMRobertaForMaskedLM(RobertaForMaskedLM): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -190,9 +202,12 @@ class XLMRobertaForMaskedLM(RobertaForMaskedLM): pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer +@add_start_docstrings( + """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) + XLM_ROBERTA_START_DOCSTRING, + XLM_ROBERTA_INPUTS_DOCSTRING, +) class XLMRobertaForSequenceClassification(RobertaForSequenceClassification): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -228,9 +243,12 @@ class XLMRobertaForSequenceClassification(RobertaForSequenceClassification): pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of +@add_start_docstrings( + """XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) + XLM_ROBERTA_START_DOCSTRING, + XLM_ROBERTA_INPUTS_DOCSTRING, +) class XLMRobertaForMultipleChoice(RobertaForMultipleChoice): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -262,9 +280,12 @@ class XLMRobertaForMultipleChoice(RobertaForMultipleChoice): pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""XLM-RoBERTa Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", - XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) + XLM_ROBERTA_START_DOCSTRING, + XLM_ROBERTA_INPUTS_DOCSTRING, +) class XLMRobertaForTokenClassification(RobertaForTokenClassification): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py index 3109fd8cd..2a210502d 100644 --- a/transformers/modeling_xlnet.py +++ b/transformers/modeling_xlnet.py @@ -29,7 +29,14 @@ from torch import nn from torch.nn import functional as F from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits +from .modeling_utils import ( + PreTrainedModel, + prune_linear_layer, + SequenceSummary, + PoolerAnswerClass, + PoolerEndLogits, + PoolerStartLogits, +) from .configuration_xlnet import XLNetConfig from .file_utils import add_start_docstrings @@ -37,8 +44,8 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin", - 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin", + "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin", + "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin", } @@ -50,44 +57,53 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None): tf_to_pt_map = {} - if hasattr(model, 'transformer'): - if hasattr(model, 'lm_loss'): + if hasattr(model, "transformer"): + if hasattr(model, "lm_loss"): # We will load also the output bias - tf_to_pt_map['model/lm_loss/bias'] = model.lm_loss.bias - if hasattr(model, 'sequence_summary') and 'model/sequnece_summary/summary/kernel' in tf_weights: + tf_to_pt_map["model/lm_loss/bias"] = model.lm_loss.bias + if hasattr(model, "sequence_summary") and "model/sequnece_summary/summary/kernel" in tf_weights: # We will load also the sequence summary - tf_to_pt_map['model/sequnece_summary/summary/kernel'] = model.sequence_summary.summary.weight - tf_to_pt_map['model/sequnece_summary/summary/bias'] = model.sequence_summary.summary.bias - if hasattr(model, 'logits_proj') and config.finetuning_task is not None \ - and 'model/regression_{}/logit/kernel'.format(config.finetuning_task) in tf_weights: - tf_to_pt_map['model/regression_{}/logit/kernel'.format(config.finetuning_task)] = model.logits_proj.weight - tf_to_pt_map['model/regression_{}/logit/bias'.format(config.finetuning_task)] = model.logits_proj.bias + tf_to_pt_map["model/sequnece_summary/summary/kernel"] = model.sequence_summary.summary.weight + tf_to_pt_map["model/sequnece_summary/summary/bias"] = model.sequence_summary.summary.bias + if ( + hasattr(model, "logits_proj") + and config.finetuning_task is not None + and "model/regression_{}/logit/kernel".format(config.finetuning_task) in tf_weights + ): + tf_to_pt_map["model/regression_{}/logit/kernel".format(config.finetuning_task)] = model.logits_proj.weight + tf_to_pt_map["model/regression_{}/logit/bias".format(config.finetuning_task)] = model.logits_proj.bias # Now load the rest of the transformer model = model.transformer # Embeddings and output - tf_to_pt_map.update({'model/transformer/word_embedding/lookup_table': model.word_embedding.weight, - 
'model/transformer/mask_emb/mask_emb': model.mask_emb}) + tf_to_pt_map.update( + { + "model/transformer/word_embedding/lookup_table": model.word_embedding.weight, + "model/transformer/mask_emb/mask_emb": model.mask_emb, + } + ) # Transformer blocks for i, b in enumerate(model.layer): layer_str = "model/transformer/layer_%d/" % i - tf_to_pt_map.update({ - layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight, - layer_str + "rel_attn/LayerNorm/beta": b.rel_attn.layer_norm.bias, - layer_str + "rel_attn/o/kernel": b.rel_attn.o, - layer_str + "rel_attn/q/kernel": b.rel_attn.q, - layer_str + "rel_attn/k/kernel": b.rel_attn.k, - layer_str + "rel_attn/r/kernel": b.rel_attn.r, - layer_str + "rel_attn/v/kernel": b.rel_attn.v, - layer_str + "ff/LayerNorm/gamma": b.ff.layer_norm.weight, - layer_str + "ff/LayerNorm/beta": b.ff.layer_norm.bias, - layer_str + "ff/layer_1/kernel": b.ff.layer_1.weight, - layer_str + "ff/layer_1/bias": b.ff.layer_1.bias, - layer_str + "ff/layer_2/kernel": b.ff.layer_2.weight, - layer_str + "ff/layer_2/bias": b.ff.layer_2.bias, - }) + tf_to_pt_map.update( + { + layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight, + layer_str + "rel_attn/LayerNorm/beta": b.rel_attn.layer_norm.bias, + layer_str + "rel_attn/o/kernel": b.rel_attn.o, + layer_str + "rel_attn/q/kernel": b.rel_attn.q, + layer_str + "rel_attn/k/kernel": b.rel_attn.k, + layer_str + "rel_attn/r/kernel": b.rel_attn.r, + layer_str + "rel_attn/v/kernel": b.rel_attn.v, + layer_str + "ff/LayerNorm/gamma": b.ff.layer_norm.weight, + layer_str + "ff/LayerNorm/beta": b.ff.layer_norm.bias, + layer_str + "ff/layer_1/kernel": b.ff.layer_1.weight, + layer_str + "ff/layer_1/bias": b.ff.layer_1.bias, + layer_str + "ff/layer_2/kernel": b.ff.layer_2.weight, + layer_str + "ff/layer_2/bias": b.ff.layer_2.bias, + } + ) # Relative positioning biases if config.untie_r: @@ -105,13 +121,17 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None): r_w_list = [model.r_w_bias] r_s_list = [model.r_s_bias] seg_embed_list = [model.seg_embed] - tf_to_pt_map.update({ - 'model/transformer/r_r_bias': r_r_list, - 'model/transformer/r_w_bias': r_w_list, - 'model/transformer/r_s_bias': r_s_list, - 'model/transformer/seg_embed': seg_embed_list}) + tf_to_pt_map.update( + { + "model/transformer/r_r_bias": r_r_list, + "model/transformer/r_w_bias": r_w_list, + "model/transformer/r_s_bias": r_s_list, + "model/transformer/seg_embed": seg_embed_list, + } + ) return tf_to_pt_map + def load_tf_weights_in_xlnet(model, config, tf_path): """ Load tf checkpoints in a pytorch model """ @@ -119,8 +139,10 @@ def load_tf_weights_in_xlnet(model, config, tf_path): import numpy as np import tensorflow as tf except ImportError: - logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) raise # Load weights from TF model init_vars = tf.train.list_variables(tf_path) @@ -141,7 +163,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path): array = tf_weights[name] # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model - if 'kernel' in name and ('ff' in name or 'summary' in name or 'logit' in name): + if "kernel" in name and ("ff" in name or "summary" in name or "logit" in name): logger.info("Transposing") array = np.transpose(array) if isinstance(pointer, list): @@ -165,10 +187,10 @@ def load_tf_weights_in_xlnet(model, config, tf_path): logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) tf_weights.pop(name, None) - tf_weights.pop(name + '/Adam', None) - tf_weights.pop(name + '/Adam_1', None) + tf_weights.pop(name + "/Adam", None) + tf_weights.pop(name + "/Adam_1", None) - logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) + logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys()))) return model @@ -199,7 +221,8 @@ class XLNetRelativeAttention(nn.Module): if config.d_model % config.n_head != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.d_model, config.n_head)) + "heads (%d)" % (config.d_model, config.n_head) + ) self.n_head = config.n_head self.d_head = config.d_head @@ -242,7 +265,7 @@ class XLNetRelativeAttention(nn.Module): x = x.reshape(x_size[0], x_size[1], x_size[3], x_size[2]) x = x[:, :, 1:, :] - x = x.reshape(x_size[0], x_size[1], x_size[2], x_size[3]-1) + x = x.reshape(x_size[0], x_size[1], x_size[2], x_size[3] - 1) # Note: the tensor-slice form was faster in my testing than torch.index_select # However, tracing doesn't like the nature of the slice, and if klen changes # during the run then it'll fail, whereas index_select will be fine. 
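On the "Transposing" branch in load_tf_weights_in_xlnet above: TF dense kernels for the ff/summary/logit layers are stored as (in_features, out_features), while nn.Linear keeps (out_features, in_features), hence the np.transpose before assignment. A tiny sketch with an invented layer size:

import numpy as np
import torch
import torch.nn as nn

tf_kernel = np.random.randn(1024, 4096).astype(np.float32)    # TF layout: (in, out)
layer = nn.Linear(1024, 4096)                                  # weight layout: (out, in)
layer.weight.data = torch.from_numpy(np.transpose(tf_kernel))
assert layer.weight.shape == (4096, 1024)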
@@ -255,27 +278,27 @@ class XLNetRelativeAttention(nn.Module): """Core relative positional attention operations.""" # content based attention score - ac = torch.einsum('ibnd,jbnd->bnij', q_head + self.r_w_bias, k_head_h) + ac = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_w_bias, k_head_h) # position based attention score - bd = torch.einsum('ibnd,jbnd->bnij', q_head + self.r_r_bias, k_head_r) + bd = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_r_bias, k_head_r) bd = self.rel_shift_bnij(bd, klen=ac.shape[3]) # segment based attention score if seg_mat is None: ef = 0 else: - ef = torch.einsum('ibnd,snd->ibns', q_head + self.r_s_bias, self.seg_embed) - ef = torch.einsum('ijbs,ibns->bnij', seg_mat, ef) + ef = torch.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed) + ef = torch.einsum("ijbs,ibns->bnij", seg_mat, ef) # merge attention scores and perform masking attn_score = (ac + bd + ef) * self.scale if attn_mask is not None: # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask if attn_mask.dtype == torch.float16: - attn_score = attn_score - 65500 * torch.einsum('ijbn->bnij', attn_mask) + attn_score = attn_score - 65500 * torch.einsum("ijbn->bnij", attn_mask) else: - attn_score = attn_score - 1e30 * torch.einsum('ijbn->bnij', attn_mask) + attn_score = attn_score - 1e30 * torch.einsum("ijbn->bnij", attn_mask) # attention probability attn_prob = F.softmax(attn_score, dim=3) @@ -283,20 +306,20 @@ class XLNetRelativeAttention(nn.Module): # Mask heads if we want to if head_mask is not None: - attn_prob = attn_prob * torch.einsum('ijbn->bnij', head_mask) + attn_prob = attn_prob * torch.einsum("ijbn->bnij", head_mask) # attention output - attn_vec = torch.einsum('bnij,jbnd->ibnd', attn_prob, v_head_h) + attn_vec = torch.einsum("bnij,jbnd->ibnd", attn_prob, v_head_h) if self.output_attentions: - return attn_vec, torch.einsum('bnij->ijbn', attn_prob) + return attn_vec, torch.einsum("bnij->ijbn", attn_prob) return attn_vec def post_attention(self, h, attn_vec, residual=True): """Post-attention processing.""" # post-attention projection (back to `d_model`) - attn_out = torch.einsum('ibnd,hnd->ibh', attn_vec, self.o) + attn_out = torch.einsum("ibnd,hnd->ibh", attn_vec, self.o) attn_out = self.dropout(attn_out) if residual: @@ -305,10 +328,7 @@ class XLNetRelativeAttention(nn.Module): return output - def forward(self, h, g, - attn_mask_h, attn_mask_g, - r, seg_mat, - mems=None, target_mapping=None, head_mask=None): + def forward(self, h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None): if g is not None: ###### Two-stream attention with relative positional encoding. 
# content based attention score @@ -318,21 +338,22 @@ class XLNetRelativeAttention(nn.Module): cat = h # content-based key head - k_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.k) + k_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.k) # content-based value head - v_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.v) + v_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.v) # position-based key head - k_head_r = torch.einsum('ibh,hnd->ibnd', r, self.r) + k_head_r = torch.einsum("ibh,hnd->ibnd", r, self.r) ##### h-stream # content-stream query head - q_head_h = torch.einsum('ibh,hnd->ibnd', h, self.q) + q_head_h = torch.einsum("ibh,hnd->ibnd", h, self.q) # core attention ops attn_vec_h = self.rel_attn_core( - q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask) + q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask + ) if self.output_attentions: attn_vec_h, attn_prob_h = attn_vec_h @@ -342,21 +363,23 @@ class XLNetRelativeAttention(nn.Module): ##### g-stream # query-stream query head - q_head_g = torch.einsum('ibh,hnd->ibnd', g, self.q) + q_head_g = torch.einsum("ibh,hnd->ibnd", g, self.q) # core attention ops if target_mapping is not None: - q_head_g = torch.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping) + q_head_g = torch.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping) attn_vec_g = self.rel_attn_core( - q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask) + q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask + ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g - attn_vec_g = torch.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping) + attn_vec_g = torch.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping) else: attn_vec_g = self.rel_attn_core( - q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask) + q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask + ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g @@ -375,16 +398,17 @@ class XLNetRelativeAttention(nn.Module): cat = h # content heads - q_head_h = torch.einsum('ibh,hnd->ibnd', h, self.q) - k_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.k) - v_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.v) + q_head_h = torch.einsum("ibh,hnd->ibnd", h, self.q) + k_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.k) + v_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.v) # positional heads - k_head_r = torch.einsum('ibh,hnd->ibnd', r, self.r) + k_head_r = torch.einsum("ibh,hnd->ibnd", r, self.r) # core attention ops attn_vec = self.rel_attn_core( - q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask) + q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask + ) if self.output_attentions: attn_vec, attn_prob = attn_vec @@ -398,6 +422,7 @@ class XLNetRelativeAttention(nn.Module): outputs = outputs + (attn_prob,) return outputs + class XLNetFeedForward(nn.Module): def __init__(self, config): super(XLNetFeedForward, self).__init__() @@ -405,8 +430,9 @@ class XLNetFeedForward(nn.Module): self.layer_1 = nn.Linear(config.d_model, config.d_inner) self.layer_2 = nn.Linear(config.d_inner, config.d_model) self.dropout = nn.Dropout(config.dropout) - if isinstance(config.ff_activation, str) or \ - (sys.version_info[0] == 2 and 
isinstance(config.ff_activation, unicode)): + if isinstance(config.ff_activation, str) or ( + sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode) + ): self.activation_function = ACT2FN[config.ff_activation] else: self.activation_function = config.ff_activation @@ -421,6 +447,7 @@ class XLNetFeedForward(nn.Module): output = self.layer_norm(output + inp) return output + class XLNetLayer(nn.Module): def __init__(self, config): super(XLNetLayer, self).__init__() @@ -428,12 +455,20 @@ class XLNetLayer(nn.Module): self.ff = XLNetFeedForward(config) self.dropout = nn.Dropout(config.dropout) - def forward(self, output_h, output_g, - attn_mask_h, attn_mask_g, - r, seg_mat, mems=None, target_mapping=None, head_mask=None): - outputs = self.rel_attn(output_h, output_g, attn_mask_h, attn_mask_g, - r, seg_mat, mems=mems, target_mapping=target_mapping, - head_mask=head_mask) + def forward( + self, output_h, output_g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None + ): + outputs = self.rel_attn( + output_h, + output_g, + attn_mask_h, + attn_mask_g, + r, + seg_mat, + mems=mems, + target_mapping=target_mapping, + head_mask=head_mask, + ) output_h, output_g = outputs[:2] if output_g is not None: @@ -448,6 +483,7 @@ class XLNetPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = XLNetConfig pretrained_model_archive_map = XLNET_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_xlnet @@ -466,12 +502,20 @@ class XLNetPreTrainedModel(PreTrainedModel): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, XLNetRelativeAttention): - for param in [module.q, module.k, module.v, module.o, module.r, - module.r_r_bias, module.r_s_bias, module.r_w_bias, - module.seg_embed]: + for param in [ + module.q, + module.k, + module.v, + module.o, + module.r, + module.r_r_bias, + module.r_s_bias, + module.r_w_bias, + module.seg_embed, + ]: param.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, XLNetModel): - module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range) + module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range) XLNET_START_DOCSTRING = r""" The XLNet model was proposed in @@ -564,8 +608,12 @@ XLNET_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. 
""" -@add_start_docstrings("The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.", - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.", + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class XLNetModel(XLNetPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -594,6 +642,7 @@ class XLNetModel(XLNetPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(XLNetModel, self).__init__(config) self.output_attentions = config.output_attentions @@ -658,18 +707,18 @@ class XLNetModel(XLNetPreTrainedModel): def cache_mem(self, curr_out, prev_mem): """cache hidden states into memory.""" if self.reuse_len is not None and self.reuse_len > 0: - curr_out = curr_out[:self.reuse_len] + curr_out = curr_out[: self.reuse_len] if prev_mem is None: - new_mem = curr_out[-self.mem_len:] + new_mem = curr_out[-self.mem_len :] else: - new_mem = torch.cat([prev_mem, curr_out], dim=0)[-self.mem_len:] + new_mem = torch.cat([prev_mem, curr_out], dim=0)[-self.mem_len :] return new_mem.detach() @staticmethod def positional_embedding(pos_seq, inv_freq, bsz=None): - sinusoid_inp = torch.einsum('i,d->id', pos_seq, inv_freq) + sinusoid_inp = torch.einsum("i,d->id", pos_seq, inv_freq) pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1) pos_emb = pos_emb[:, None, :] @@ -683,14 +732,14 @@ class XLNetModel(XLNetPreTrainedModel): freq_seq = torch.arange(0, self.d_model, 2.0, dtype=torch.float) inv_freq = 1 / torch.pow(10000, (freq_seq / self.d_model)) - if self.attn_type == 'bi': + if self.attn_type == "bi": # beg, end = klen - 1, -qlen beg, end = klen, -qlen - elif self.attn_type == 'uni': + elif self.attn_type == "uni": # beg, end = klen - 1, -1 beg, end = klen, -1 else: - raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type)) + raise ValueError("Unknown `attn_type` {}.".format(self.attn_type)) if self.bi_data: fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.float) @@ -701,8 +750,8 @@ class XLNetModel(XLNetPreTrainedModel): bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len) if bsz is not None: - fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz//2) - bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz//2) + fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2) + bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2) else: fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq) bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq) @@ -717,8 +766,18 @@ class XLNetModel(XLNetPreTrainedModel): pos_emb = pos_emb.to(next(self.parameters())) return pos_emb - def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None): + def forward( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + ): # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # but we want a unified interface in the library with the batch size on the first dimension # so we move here the first dimension (batch) 
to the end @@ -739,7 +798,6 @@ class XLNetModel(XLNetPreTrainedModel): perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None - mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0 klen = mlen + qlen @@ -748,13 +806,13 @@ class XLNetModel(XLNetPreTrainedModel): ##### Attention mask # causal attention mask - if self.attn_type == 'uni': + if self.attn_type == "uni": attn_mask = self.create_mask(qlen, mlen) attn_mask = attn_mask[:, :, None, None] - elif self.attn_type == 'bi': + elif self.attn_type == "bi": attn_mask = None else: - raise ValueError('Unsupported attention type: {}'.format(self.attn_type)) + raise ValueError("Unsupported attention type: {}".format(self.attn_type)) # data mask: input mask & perm mask assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) " @@ -799,9 +857,9 @@ class XLNetModel(XLNetPreTrainedModel): output_h = self.dropout(word_emb_k) if target_mapping is not None: word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1) - # else: # We removed the inp_q input which was same as target mapping - # inp_q_ext = inp_q[:, :, None] - # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k + # else: # We removed the inp_q input which was same as target mapping + # inp_q_ext = inp_q[:, :, None] + # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k output_g = self.dropout(word_emb_q) else: output_g = None @@ -836,7 +894,9 @@ class XLNetModel(XLNetPreTrainedModel): head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.n_layer @@ -853,9 +913,17 @@ class XLNetModel(XLNetPreTrainedModel): if self.output_hidden_states: hidden_states.append((output_h, output_g) if output_g is not None else output_h) - outputs = layer_module(output_h, output_g, attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask, - r=pos_emb, seg_mat=seg_mat, mems=mems[i], target_mapping=target_mapping, - head_mask=head_mask[i]) + outputs = layer_module( + output_h, + output_g, + attn_mask_h=non_tgt_mask, + attn_mask_g=attn_mask, + r=pos_emb, + seg_mat=seg_mat, + mems=mems[i], + target_mapping=target_mapping, + head_mask=head_mask[i], + ) output_h, output_g = outputs[:2] if self.output_attentions: attentions.append(outputs[2]) @@ -881,7 +949,9 @@ class XLNetModel(XLNetPreTrainedModel): if self.output_attentions: if target_mapping is not None: # when target_mapping is provided, there are 2-tuple of attentions - attentions = tuple(tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions) + attentions = tuple( + tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions + ) else: attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions) outputs = outputs + (attentions,) @@ -889,9 +959,12 @@ class XLNetModel(XLNetPreTrainedModel): return outputs # outputs, (new_mems), (hidden_states), (attentions) -@add_start_docstrings("""XLNet Model with a language modeling head on top +@add_start_docstrings( + """XLNet Model with a language modeling head 
on top (linear layer with weights tied to the input embeddings). """, - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class XLNetLMHeadModel(XLNetPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -934,6 +1007,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] """ + def __init__(self, config): super(XLNetLMHeadModel, self).__init__(config) self.attn_type = config.attn_type @@ -954,34 +1028,42 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): # Build permutation mask so that previous tokens don't see last token perm_mask = torch.zeros( - (input_ids.shape[0], input_ids.shape[1], input_ids.shape[1]), - dtype=torch.float, device=input_ids.device + (input_ids.shape[0], input_ids.shape[1], input_ids.shape[1]), dtype=torch.float, device=input_ids.device ) perm_mask[:, :, -1] = 1.0 # We'll only predict the last token target_mapping = torch.zeros( - (input_ids.shape[0], 1, input_ids.shape[1]), - dtype=torch.float, device=input_ids.device + (input_ids.shape[0], 1, input_ids.shape[1]), dtype=torch.float, device=input_ids.device ) target_mapping[0, 0, -1] = 1.0 - return {"input_ids": input_ids, - "perm_mask": perm_mask, - "target_mapping": target_mapping - } - - def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - mems=mems, - perm_mask=perm_mask, - target_mapping=target_mapping, - token_type_ids=token_type_ids, - input_mask=input_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + return {"input_ids": input_ids, "perm_mask": perm_mask, "target_mapping": target_mapping} + + def forward( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) logits = self.lm_loss(transformer_outputs[0]) @@ -990,16 +1072,18 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): if labels is not None: # Flatten the tokens loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, logits.size(-1)), - labels.view(-1)) + loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1)) outputs = (loss,) + outputs return outputs # return (loss), logits, (mems), (hidden states), (attentions) -@add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class XLNetForSequenceClassification(XLNetPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -1037,6 +1121,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(XLNetForSequenceClassification, self).__init__(config) self.num_labels = config.num_labels @@ -1047,17 +1132,30 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - mems=mems, - perm_mask=perm_mask, - target_mapping=target_mapping, - token_type_ids=token_type_ids, - input_mask=input_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) output = transformer_outputs[0] output = self.sequence_summary(output) @@ -1077,10 +1175,13 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): return outputs # return (loss), logits, (mems), (hidden states), (attentions) -@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of + +@add_start_docstrings( + """XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", - XLNET_START_DOCSTRING, - XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class XLNetForTokenClassification(XLNetPreTrainedModel): r""" Inputs: @@ -1135,6 +1236,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): scores = outputs[0] """ + def __init__(self, config): super(XLNetForTokenClassification, self).__init__(config) self.num_labels = config.num_labels @@ -1144,18 +1246,31 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None): - - outputs = self.transformer(input_ids, - attention_mask=attention_mask, - mems=mems, - perm_mask=perm_mask, - target_mapping=target_mapping, - token_type_ids=token_type_ids, - input_mask=input_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] @@ -1177,9 +1292,12 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): return outputs # return (loss), logits, (mems), (hidden states), (attentions) -@add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of +@add_start_docstrings( + """XLNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RACE/SWAG tasks. 
""", - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class XLNetForMultipleChoice(XLNetPreTrainedModel): r""" Inputs: @@ -1239,6 +1357,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): loss, classification_scores = outputs[:2] """ + def __init__(self, config): super(XLNetForMultipleChoice, self).__init__(config) @@ -1248,9 +1367,19 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, token_type_ids=None, input_mask=None, attention_mask=None, - mems=None, perm_mask=None, target_mapping=None, - labels=None, head_mask=None, inputs_embeds=None): + def forward( + self, + input_ids=None, + token_type_ids=None, + input_mask=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + labels=None, + head_mask=None, + inputs_embeds=None, + ): num_choices = input_ids.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) @@ -1258,18 +1387,26 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None flat_input_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None - transformer_outputs = self.transformer(flat_input_ids, token_type_ids=flat_token_type_ids, - input_mask=flat_input_mask, attention_mask=flat_attention_mask, - mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, - head_mask=head_mask, inputs_embeds=inputs_embeds) - + transformer_outputs = self.transformer( + flat_input_ids, + token_type_ids=flat_token_type_ids, + input_mask=flat_input_mask, + attention_mask=flat_attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) output = transformer_outputs[0] output = self.sequence_summary(output) logits = self.logits_proj(output) reshaped_logits = logits.view(-1, num_choices) - outputs = (reshaped_logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + outputs = (reshaped_logits,) + transformer_outputs[ + 1: + ] # Keep mems, hidden states, attentions if there are in it if labels is not None: loss_fct = CrossEntropyLoss() @@ -1279,9 +1416,12 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): return outputs # return (loss), logits, (mems), (hidden states), (attentions) -@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -1325,6 +1465,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): loss, start_scores, end_scores = outputs[:2] """ + def __init__(self, config): super(XLNetForQuestionAnsweringSimple, self).__init__(config) self.num_labels = config.num_labels @@ -1334,19 +1475,32 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, - start_positions=None, end_positions=None): - - outputs = self.transformer(input_ids, - attention_mask=attention_mask, - mems=mems, - perm_mask=perm_mask, - target_mapping=target_mapping, - token_type_ids=token_type_ids, - input_mask=input_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + ): + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] @@ -1376,9 +1530,12 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): return outputs # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions) -@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class XLNetForQuestionAnswering(XLNetPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -1440,6 +1597,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): loss, start_scores, end_scores = outputs[:2] """ + def __init__(self, config): super(XLNetForQuestionAnswering, self).__init__(config) self.start_n_top = config.start_n_top @@ -1452,18 +1610,34 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, - start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - mems=mems, - perm_mask=perm_mask, - target_mapping=target_mapping, - token_type_ids=token_type_ids, - input_mask=input_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + is_impossible=None, + cls_index=None, + p_mask=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) hidden_states = transformer_outputs[0] start_logits = self.start_logits(hidden_states, p_mask=p_mask) @@ -1497,24 +1671,34 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): else: # during inference, compute the end logits based on beam search bsz, slen, hsz = hidden_states.size() - start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) - - start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top) - start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) - start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) - start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) - - hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz) + start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) + + start_top_log_probs, start_top_index = torch.topk( + start_log_probs, self.start_n_top, dim=-1 + ) # shape (bsz, start_n_top) + start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) + start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) + start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) + + hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( + start_states + ) # shape (bsz, slen, start_n_top, hsz) p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) - end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) + end_log_probs = F.softmax(end_logits, 
dim=1) # shape (bsz, slen, start_n_top) - end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top) + end_top_log_probs, end_top_index = torch.topk( + end_log_probs, self.end_n_top, dim=1 + ) # shape (bsz, end_n_top, start_n_top) end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) - start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) # get the representation of START as weighted sum of hidden states - cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) # Shape (batch size,): one single `cls_logits` for each sample + start_states = torch.einsum( + "blh,bl->bh", hidden_states, start_log_probs + ) # get the representation of START as weighted sum of hidden states + cls_logits = self.answer_class( + hidden_states, start_states=start_states, cls_index=cls_index + ) # Shape (batch size,): one single `cls_logits` for each sample outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs diff --git a/transformers/optimization.py b/transformers/optimization.py index 99e6cc75e..0cd57078b 100644 --- a/transformers/optimization.py +++ b/transformers/optimization.py @@ -34,10 +34,11 @@ def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1 """ Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate increases linearly between 0 and 1. """ + def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1.0, num_warmup_steps)) - return 1. + return 1.0 return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) @@ -46,40 +47,47 @@ def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_st """ Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period. """ + def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) - return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))) + return max( + 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)) + ) return LambdaLR(optimizer, lr_lambda, last_epoch) -def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1): +def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1): """ Create a schedule with a learning rate that decreases following the values of the cosine function between 0 and `pi * cycles` after a warmup period during which it increases linearly between 0 and 1. """ + def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) - return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. 
* progress))) + return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) return LambdaLR(optimizer, lr_lambda, last_epoch) -def get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=1., last_epoch=-1): +def get_cosine_with_hard_restarts_schedule_with_warmup( + optimizer, num_warmup_steps, num_training_steps, num_cycles=1.0, last_epoch=-1 +): """ Create a schedule with a learning rate that decreases following the values of the cosine function with several hard restarts, after a warmup period during which it increases linearly between 0 and 1. """ + def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) - if progress >= 1.: - return 0. - return max(0., 0.5 * (1. + math.cos(math.pi * ((float(num_cycles) * progress) % 1.)))) + if progress >= 1.0: + return 0.0 + return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) return LambdaLR(optimizer, lr_lambda, last_epoch) @@ -94,17 +102,17 @@ class AdamW(Optimizer): weight_decay (float): Weight decay. Default: 0.0 correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True. """ + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True): if lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) if not 0.0 <= betas[0] < 1.0: raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: + if not 0.0 <= betas[1] < 1.0: raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1])) if not 0.0 <= eps: raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps)) - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, - correct_bias=correct_bias) + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias) super(AdamW, self).__init__(params, defaults) def step(self, closure=None): @@ -119,38 +127,38 @@ class AdamW(Optimizer): loss = closure() for group in self.param_groups: - for p in group['params']: + for p in group["params"]: if p.grad is None: continue grad = p.grad.data if grad.is_sparse: - raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead") state = self.state[p] # State initialization if len(state) == 0: - state['step'] = 0 + state["step"] = 0 # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) + state["exp_avg"] = torch.zeros_like(p.data) # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) + state["exp_avg_sq"] = torch.zeros_like(p.data) - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] + beta1, beta2 = group["betas"] - state['step'] += 1 + state["step"] += 1 # Decay the first and second moment running average coefficient # In-place operations to update the averages at the same time exp_avg.mul_(beta1).add_(1.0 - beta1, grad) exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) - denom = exp_avg_sq.sqrt().add_(group['eps']) 
+ denom = exp_avg_sq.sqrt().add_(group["eps"]) - step_size = group['lr'] - if group['correct_bias']: # No bias correction for Bert - bias_correction1 = 1.0 - beta1 ** state['step'] - bias_correction2 = 1.0 - beta2 ** state['step'] + step_size = group["lr"] + if group["correct_bias"]: # No bias correction for Bert + bias_correction1 = 1.0 - beta1 ** state["step"] + bias_correction2 = 1.0 - beta2 ** state["step"] step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 p.data.addcdiv_(-step_size, exp_avg, denom) @@ -163,7 +171,7 @@ class AdamW(Optimizer): # with the m/v parameters. This is equivalent to adding the square # of the weights to the loss with plain (non-momentum) SGD. # Add weight decay at the end (fixed version) - if group['weight_decay'] > 0.0: - p.data.add_(-group['lr'] * group['weight_decay'], p.data) + if group["weight_decay"] > 0.0: + p.data.add_(-group["lr"] * group["weight_decay"], p.data) return loss diff --git a/transformers/optimization_tf.py b/transformers/optimization_tf.py index c5fa24808..bdcbd323c 100644 --- a/transformers/optimization_tf.py +++ b/transformers/optimization_tf.py @@ -24,70 +24,64 @@ import tensorflow as tf class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): - """Applys a warmup schedule on a given learning rate decay schedule.""" - - def __init__( - self, - initial_learning_rate, - decay_schedule_fn, - warmup_steps, - power=1.0, - name=None): - super(WarmUp, self).__init__() - self.initial_learning_rate = initial_learning_rate - self.warmup_steps = warmup_steps - self.power = power - self.decay_schedule_fn = decay_schedule_fn - self.name = name - - def __call__(self, step): - with tf.name_scope(self.name or 'WarmUp') as name: - # Implements polynomial warmup. i.e., if global_step < warmup_steps, the - # learning rate will be `global_step/num_warmup_steps * init_lr`. - global_step_float = tf.cast(step, tf.float32) - warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) - warmup_percent_done = global_step_float / warmup_steps_float - warmup_learning_rate = ( - self.initial_learning_rate * - tf.math.pow(warmup_percent_done, self.power)) - return tf.cond(global_step_float < warmup_steps_float, - lambda: warmup_learning_rate, - lambda: self.decay_schedule_fn(step), - name=name) - - def get_config(self): - return { - 'initial_learning_rate': self.initial_learning_rate, - 'decay_schedule_fn': self.decay_schedule_fn, - 'warmup_steps': self.warmup_steps, - 'power': self.power, - 'name': self.name - } + """Applys a warmup schedule on a given learning rate decay schedule.""" + + def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None): + super(WarmUp, self).__init__() + self.initial_learning_rate = initial_learning_rate + self.warmup_steps = warmup_steps + self.power = power + self.decay_schedule_fn = decay_schedule_fn + self.name = name + + def __call__(self, step): + with tf.name_scope(self.name or "WarmUp") as name: + # Implements polynomial warmup. i.e., if global_step < warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. 
+ global_step_float = tf.cast(step, tf.float32) + warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) + warmup_percent_done = global_step_float / warmup_steps_float + warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power) + return tf.cond( + global_step_float < warmup_steps_float, + lambda: warmup_learning_rate, + lambda: self.decay_schedule_fn(step), + name=name, + ) + + def get_config(self): + return { + "initial_learning_rate": self.initial_learning_rate, + "decay_schedule_fn": self.decay_schedule_fn, + "warmup_steps": self.warmup_steps, + "power": self.power, + "name": self.name, + } def create_optimizer(init_lr, num_train_steps, num_warmup_steps): - """Creates an optimizer with learning rate schedule.""" - # Implements linear decay of the learning rate. - learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( - initial_learning_rate=init_lr, - decay_steps=num_train_steps, - end_learning_rate=0.0) - if num_warmup_steps: - learning_rate_fn = WarmUp(initial_learning_rate=init_lr, - decay_schedule_fn=learning_rate_fn, - warmup_steps=num_warmup_steps) - optimizer = AdamWeightDecay( - learning_rate=learning_rate_fn, - weight_decay_rate=0.01, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-6, - exclude_from_weight_decay=['layer_norm', 'bias']) - return optimizer + """Creates an optimizer with learning rate schedule.""" + # Implements linear decay of the learning rate. + learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( + initial_learning_rate=init_lr, decay_steps=num_train_steps, end_learning_rate=0.0 + ) + if num_warmup_steps: + learning_rate_fn = WarmUp( + initial_learning_rate=init_lr, decay_schedule_fn=learning_rate_fn, warmup_steps=num_warmup_steps + ) + optimizer = AdamWeightDecay( + learning_rate=learning_rate_fn, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=["layer_norm", "bias"], + ) + return optimizer class AdamWeightDecay(tf.keras.optimizers.Adam): - """Adam enables L2 weight decay and clip_by_global_norm on gradients. + """Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will @@ -98,99 +92,94 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): the loss with plain (non-momentum) SGD. 
""" - def __init__(self, - learning_rate=0.001, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-7, - amsgrad=False, - weight_decay_rate=0.0, - include_in_weight_decay=None, - exclude_from_weight_decay=None, - name='AdamWeightDecay', - **kwargs): - super(AdamWeightDecay, self).__init__( - learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs) - self.weight_decay_rate = weight_decay_rate - self._include_in_weight_decay = include_in_weight_decay - self._exclude_from_weight_decay = exclude_from_weight_decay - - @classmethod - def from_config(cls, config): - """Creates an optimizer from its config with WarmUp custom object.""" - custom_objects = {'WarmUp': WarmUp} - return super(AdamWeightDecay, cls).from_config( - config, custom_objects=custom_objects) - - def _prepare_local(self, var_device, var_dtype, apply_state): - super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, - apply_state) - apply_state['weight_decay_rate'] = tf.constant( - self.weight_decay_rate, name='adam_weight_decay_rate') - - def _decay_weights_op(self, var, learning_rate, apply_state): - do_decay = self._do_use_weight_decay(var.name) - if do_decay: - return var.assign_sub( - learning_rate * var * - apply_state['weight_decay_rate'], - use_locking=self._use_locking) - return tf.no_op() - - def apply_gradients(self, grads_and_vars, clip_norm, name=None): - grads, tvars = list(zip(*grads_and_vars)) - (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm) - return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars)) - - def _get_lr(self, var_device, var_dtype, apply_state): - """Retrieves the learning rate with the given state.""" - if apply_state is None: - return self._decayed_lr_t[var_dtype], {} - - apply_state = apply_state or {} - coefficients = apply_state.get((var_device, var_dtype)) - if coefficients is None: - coefficients = self._fallback_apply_state(var_device, var_dtype) - apply_state[(var_device, var_dtype)] = coefficients - - return coefficients['lr_t'], dict(apply_state=apply_state) - - def _resource_apply_dense(self, grad, var, apply_state=None): - lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) - decay = self._decay_weights_op(var, lr_t, apply_state) - with tf.control_dependencies([decay]): - return super(AdamWeightDecay, self)._resource_apply_dense( - grad, var, **kwargs) - - def _resource_apply_sparse(self, grad, var, indices, apply_state=None): - lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) - decay = self._decay_weights_op(var, lr_t, apply_state) - with tf.control_dependencies([decay]): - return super(AdamWeightDecay, self)._resource_apply_sparse( - grad, var, indices, **kwargs) - - def get_config(self): - config = super(AdamWeightDecay, self).get_config() - config.update({ - 'weight_decay_rate': self.weight_decay_rate, - }) - return config - - def _do_use_weight_decay(self, param_name): - """Whether to use L2 weight decay for `param_name`.""" - if self.weight_decay_rate == 0: - return False - - if self._include_in_weight_decay: - for r in self._include_in_weight_decay: - if re.search(r, param_name) is not None: - return True - - if self._exclude_from_weight_decay: - for r in self._exclude_from_weight_decay: - if re.search(r, param_name) is not None: - return False - return True + def __init__( + self, + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-7, + amsgrad=False, + weight_decay_rate=0.0, + include_in_weight_decay=None, + exclude_from_weight_decay=None, + name="AdamWeightDecay", + **kwargs + 
): + super(AdamWeightDecay, self).__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs) + self.weight_decay_rate = weight_decay_rate + self._include_in_weight_decay = include_in_weight_decay + self._exclude_from_weight_decay = exclude_from_weight_decay + + @classmethod + def from_config(cls, config): + """Creates an optimizer from its config with WarmUp custom object.""" + custom_objects = {"WarmUp": WarmUp} + return super(AdamWeightDecay, cls).from_config(config, custom_objects=custom_objects) + + def _prepare_local(self, var_device, var_dtype, apply_state): + super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, apply_state) + apply_state["weight_decay_rate"] = tf.constant(self.weight_decay_rate, name="adam_weight_decay_rate") + + def _decay_weights_op(self, var, learning_rate, apply_state): + do_decay = self._do_use_weight_decay(var.name) + if do_decay: + return var.assign_sub( + learning_rate * var * apply_state["weight_decay_rate"], use_locking=self._use_locking + ) + return tf.no_op() + + def apply_gradients(self, grads_and_vars, clip_norm, name=None): + grads, tvars = list(zip(*grads_and_vars)) + (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm) + return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars)) + + def _get_lr(self, var_device, var_dtype, apply_state): + """Retrieves the learning rate with the given state.""" + if apply_state is None: + return self._decayed_lr_t[var_dtype], {} + + apply_state = apply_state or {} + coefficients = apply_state.get((var_device, var_dtype)) + if coefficients is None: + coefficients = self._fallback_apply_state(var_device, var_dtype) + apply_state[(var_device, var_dtype)] = coefficients + + return coefficients["lr_t"], dict(apply_state=apply_state) + + def _resource_apply_dense(self, grad, var, apply_state=None): + lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) + decay = self._decay_weights_op(var, lr_t, apply_state) + with tf.control_dependencies([decay]): + return super(AdamWeightDecay, self)._resource_apply_dense(grad, var, **kwargs) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) + decay = self._decay_weights_op(var, lr_t, apply_state) + with tf.control_dependencies([decay]): + return super(AdamWeightDecay, self)._resource_apply_sparse(grad, var, indices, **kwargs) + + def get_config(self): + config = super(AdamWeightDecay, self).get_config() + config.update( + {"weight_decay_rate": self.weight_decay_rate,} + ) + return config + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if self.weight_decay_rate == 0: + return False + + if self._include_in_weight_decay: + for r in self._include_in_weight_decay: + if re.search(r, param_name) is not None: + return True + + if self._exclude_from_weight_decay: + for r in self._exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True ## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py @@ -201,10 +190,8 @@ class GradientAccumulator(object): """Initializes the accumulator.""" self._gradients = [] self._accum_steps = tf.Variable( - initial_value=0, - dtype=tf.int64, - trainable=False, - aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA) + initial_value=0, dtype=tf.int64, trainable=False, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA + ) @property def step(self): @@ 
-214,12 +201,19 @@ class GradientAccumulator(object): @property def gradients(self): """The accumulated gradients.""" - return list(gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients()) + return list( + gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients() + ) def __call__(self, gradients): """Accumulates :obj:`gradients`.""" if not self._gradients: - self._gradients.extend([tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient for gradient in gradients]) + self._gradients.extend( + [ + tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient + for gradient in gradients + ] + ) if len(gradients) != len(self._gradients): raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients))) @@ -249,6 +243,9 @@ class GradientAccumulator(object): if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1: return self._gradients - return (gradient.device_map.select_for_current_replica(gradient.values, replica_context) for gradient in self._gradients) + return ( + gradient.device_map.select_for_current_replica(gradient.values, replica_context) + for gradient in self._gradients + ) else: return self._gradients diff --git a/transformers/pipelines.py b/transformers/pipelines.py index f4bf3da68..4149c2e47 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -30,25 +30,42 @@ from typing import Union, Optional, Tuple, List, Dict import numpy as np -from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer, - PretrainedConfig, ModelCard, SquadExample, - squad_convert_examples_to_features, is_tf_available, - is_torch_available, BasicTokenizer, - ALL_PRETRAINED_CONFIG_ARCHIVE_MAP) +from transformers import ( + AutoConfig, + AutoTokenizer, + PreTrainedTokenizer, + PretrainedConfig, + ModelCard, + SquadExample, + squad_convert_examples_to_features, + is_tf_available, + is_torch_available, + BasicTokenizer, + ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, +) if is_tf_available(): import tensorflow as tf - from transformers import TFAutoModel, TFAutoModelForSequenceClassification, \ - TFAutoModelForQuestionAnswering, TFAutoModelForTokenClassification + from transformers import ( + TFAutoModel, + TFAutoModelForSequenceClassification, + TFAutoModelForQuestionAnswering, + TFAutoModelForTokenClassification, + ) if is_torch_available(): import torch - from transformers import AutoModel, AutoModelForSequenceClassification, \ - AutoModelForQuestionAnswering, AutoModelForTokenClassification + from transformers import ( + AutoModel, + AutoModelForSequenceClassification, + AutoModelForQuestionAnswering, + AutoModelForTokenClassification, + ) logger = logging.getLogger(__name__) + def get_framework(model=None): """ Select framework (TensorFlow/PyTorch) to use. If both frameworks are installed and no specific model is provided, defaults to using PyTorch. @@ -56,20 +73,24 @@ def get_framework(model=None): if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str): # Both framework are available but the use supplied a model class instance. 
# Try to guess which framework to use from the model classname - framework = 'tf' if model.__class__.__name__.startswith('TF') else 'pt' + framework = "tf" if model.__class__.__name__.startswith("TF") else "pt" elif not is_tf_available() and not is_torch_available(): - raise ImportError("At least one of TensorFlow 2.0 or PyTorch should be installed. " - "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ " - "To install PyTorch, read the instructions at https://pytorch.org/.") + raise ImportError( + "At least one of TensorFlow 2.0 or PyTorch should be installed. " + "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ " + "To install PyTorch, read the instructions at https://pytorch.org/." + ) else: # framework = 'tf' if is_tf_available() else 'pt' - framework = 'pt' if is_torch_available() else 'tf' + framework = "pt" if is_torch_available() else "tf" return framework + class ArgumentHandler(ABC): """ Base interface for handling varargs for each Pipeline """ + @abstractmethod def __call__(self, *args, **kwargs): raise NotImplementedError() @@ -79,11 +100,12 @@ class DefaultArgumentHandler(ArgumentHandler): """ Default varargs argument parser handling parameters for each Pipeline """ + def __call__(self, *args, **kwargs): - if 'X' in kwargs: - return kwargs['X'] - elif 'data' in kwargs: - return kwargs['data'] + if "X" in kwargs: + return kwargs["X"] + elif "data" in kwargs: + return kwargs["data"] elif len(args) == 1: if isinstance(args[0], list): return args[0] @@ -91,7 +113,7 @@ class DefaultArgumentHandler(ArgumentHandler): return [args[0]] elif len(args) > 1: return list(args) - raise ValueError('Unable to infer the format of the provided data (X=, data=, ...)') + raise ValueError("Unable to infer the format of the provided data (X=, data=, ...)") class PipelineDataFormat: @@ -105,24 +127,25 @@ class PipelineDataFormat: PipelineDataFormat also includes some utilities to work with multi-columns like mapping from datasets columns to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format. 
""" - SUPPORTED_FORMATS = ['json', 'csv', 'pipe'] + + SUPPORTED_FORMATS = ["json", "csv", "pipe"] def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False): self.output_path = output_path self.input_path = input_path - self.column = column.split(',') if column is not None else [''] + self.column = column.split(",") if column is not None else [""] self.is_multi_columns = len(self.column) > 1 if self.is_multi_columns: - self.column = [tuple(c.split('=')) if '=' in c else (c, c) for c in self.column] + self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column] if output_path is not None and not overwrite: if exists(abspath(self.output_path)): - raise OSError('{} already exists on disk'.format(self.output_path)) + raise OSError("{} already exists on disk".format(self.output_path)) if input_path is not None: if not exists(abspath(self.input_path)): - raise OSError('{} doesnt exist on disk'.format(self.input_path)) + raise OSError("{} doesnt exist on disk".format(self.input_path)) @abstractmethod def __iter__(self): @@ -144,23 +167,25 @@ class PipelineDataFormat: :return: (str) Path where the data has been saved """ path, _ = os.path.splitext(self.output_path) - binary_path = os.path.extsep.join((path, 'pickle')) + binary_path = os.path.extsep.join((path, "pickle")) - with open(binary_path, 'wb+') as f_output: + with open(binary_path, "wb+") as f_output: pickle.dump(data, f_output) return binary_path @staticmethod - def from_str(format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False): - if format == 'json': + def from_str( + format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False + ): + if format == "json": return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) - elif format == 'csv': + elif format == "csv": return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) - elif format == 'pipe': + elif format == "pipe": return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) else: - raise KeyError('Unknown reader {} (Available reader are json/csv/pipe)'.format(format)) + raise KeyError("Unknown reader {} (Available reader are json/csv/pipe)".format(format)) class CsvPipelineDataFormat(PipelineDataFormat): @@ -168,7 +193,7 @@ class CsvPipelineDataFormat(PipelineDataFormat): super().__init__(output_path, input_path, column, overwrite=overwrite) def __iter__(self): - with open(self.input_path, 'r') as f: + with open(self.input_path, "r") as f: reader = csv.DictReader(f) for row in reader: if self.is_multi_columns: @@ -177,7 +202,7 @@ class CsvPipelineDataFormat(PipelineDataFormat): yield row[self.column[0]] def save(self, data: List[dict]): - with open(self.output_path, 'w') as f: + with open(self.output_path, "w") as f: if len(data) > 0: writer = csv.DictWriter(f, list(data[0].keys())) writer.writeheader() @@ -188,7 +213,7 @@ class JsonPipelineDataFormat(PipelineDataFormat): def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False): super().__init__(output_path, input_path, column, overwrite=overwrite) - with open(input_path, 'r') as f: + with open(input_path, "r") as f: self._entries = json.load(f) def __iter__(self): @@ -199,7 +224,7 @@ class JsonPipelineDataFormat(PipelineDataFormat): yield entry[self.column[0]] def save(self, data: dict): - with open(self.output_path, 'w') as f: + with 
open(self.output_path, "w") as f: json.dump(data, f) @@ -210,12 +235,13 @@ class PipedPipelineDataFormat(PipelineDataFormat): If columns are provided, then the output will be a dictionary with {column_x: value_x} """ + def __iter__(self): for line in sys.stdin: # Split for multi-columns - if '\t' in line: + if "\t" in line: - line = line.split('\t') + line = line.split("\t") if self.column: # Dictionary to map arguments yield {kwargs: l for (kwargs, _), l in zip(self.column, line)} @@ -232,8 +258,8 @@ class PipedPipelineDataFormat(PipelineDataFormat): def save_binary(self, data: Union[dict, List[dict]]) -> str: if self.output_path is None: raise KeyError( - 'When using piped input on pipeline outputting large object requires an output file path. ' - 'Please provide such output path through --output argument.' + "When using piped input on pipeline outputting large object requires an output file path. " + "Please provide such output path through --output argument." ) return super().save_binary(data) @@ -298,10 +324,16 @@ class Pipeline(_ScikitCompat): default_input_names = None - def __init__(self, model, tokenizer: PreTrainedTokenizer = None, - modelcard: ModelCard = None, framework: Optional[str] = None, - args_parser: ArgumentHandler = None, device: int = -1, - binary_output: bool = False): + def __init__( + self, + model, + tokenizer: PreTrainedTokenizer = None, + modelcard: ModelCard = None, + framework: Optional[str] = None, + args_parser: ArgumentHandler = None, + device: int = -1, + binary_output: bool = False, + ): if framework is None: framework = get_framework() @@ -315,8 +347,8 @@ class Pipeline(_ScikitCompat): self._args_parser = args_parser or DefaultArgumentHandler() # Special handling - if self.device >= 0 and self.framework == 'pt': - self.model = self.model.to('cuda:{}'.format(self.device)) + if self.device >= 0 and self.framework == "pt": + self.model = self.model.to("cuda:{}".format(self.device)) def save_pretrained(self, save_directory): """ @@ -356,8 +388,8 @@ class Pipeline(_ScikitCompat): Returns: Context manager """ - if self.framework == 'tf': - with tf.device('/CPU:0' if self.device == -1 else '/device:GPU:{}'.format(self.device)): + if self.framework == "tf": + with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)): yield else: if self.device >= 0: @@ -372,11 +404,11 @@ class Pipeline(_ScikitCompat): Returns: dict holding all the required parameters for model's forward """ - args = ['input_ids', 'attention_mask'] + args = ["input_ids", "attention_mask"] model_type = type(self.model).__name__.lower() - if 'distilbert' not in model_type and 'xlm' not in model_type: - args += ['token_type_ids'] + if "distilbert" not in model_type and "xlm" not in model_type: + args += ["token_type_ids"] # PR #1548 (CLI) There is an issue with attention_mask # if 'xlnet' in model_type or 'xlm' in model_type: @@ -394,9 +426,7 @@ class Pipeline(_ScikitCompat): # Encode for forward with self.device_placement(): inputs = self.tokenizer.batch_encode_plus( - inputs, add_special_tokens=True, - return_tensors=self.framework, - max_length=self.tokenizer.max_len + inputs, add_special_tokens=True, return_tensors=self.framework, max_length=self.tokenizer.max_len ) # Filter out features not available on specific models @@ -411,7 +441,7 @@ class Pipeline(_ScikitCompat): Returns: Numpy array """ - if self.framework == 'tf': + if self.framework == "tf": # TODO trace model predictions = self.model(inputs, training=False)[0] else: @@ -426,19 +456,24 @@ class 
FeatureExtractionPipeline(Pipeline): Feature extraction pipeline using Model head. """ - def __init__(self, model, - tokenizer: PreTrainedTokenizer = None, - modelcard: ModelCard = None, - framework: Optional[str] = None, - args_parser: ArgumentHandler = None, - device: int = -1): - super().__init__(model=model, - tokenizer=tokenizer, - modelcard=modelcard, - framework=framework, - args_parser=args_parser, - device=device, - binary_output=True) + def __init__( + self, + model, + tokenizer: PreTrainedTokenizer = None, + modelcard: ModelCard = None, + framework: Optional[str] = None, + args_parser: ArgumentHandler = None, + device: int = -1, + ): + super().__init__( + model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + args_parser=args_parser, + device=device, + binary_output=True, + ) def __call__(self, *args, **kwargs): return super().__call__(*args, **kwargs).tolist() @@ -452,7 +487,7 @@ class TextClassificationPipeline(Pipeline): def __call__(self, *args, **kwargs): outputs = super().__call__(*args, **kwargs) scores = np.exp(outputs) / np.exp(outputs).sum(-1) - return [{'label': self.model.config.id2label[item.argmax()], 'score': item.max()} for item in scores] + return [{"label": self.model.config.id2label[item.argmax()], "score": item.max()} for item in scores] class NerPipeline(Pipeline): @@ -460,19 +495,28 @@ class NerPipeline(Pipeline): Named Entity Recognition pipeline using ModelForTokenClassification head. """ - default_input_names = 'sequences' - - def __init__(self, model, tokenizer: PreTrainedTokenizer = None, - modelcard: ModelCard = None, framework: Optional[str] = None, - args_parser: ArgumentHandler = None, device: int = -1, - binary_output: bool = False, ignore_labels=['O']): - super().__init__(model=model, - tokenizer=tokenizer, - modelcard=modelcard, - framework=framework, - args_parser=args_parser, - device=device, - binary_output=binary_output) + default_input_names = "sequences" + + def __init__( + self, + model, + tokenizer: PreTrainedTokenizer = None, + modelcard: ModelCard = None, + framework: Optional[str] = None, + args_parser: ArgumentHandler = None, + device: int = -1, + binary_output: bool = False, + ignore_labels=["O"], + ): + super().__init__( + model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + args_parser=args_parser, + device=device, + binary_output=binary_output, + ) self._basic_tokenizer = BasicTokenizer(do_lower_case=False) self.ignore_labels = ignore_labels @@ -485,19 +529,20 @@ class NerPipeline(Pipeline): with self.device_placement(): tokens = self.tokenizer.encode_plus( - sentence, return_attention_mask=False, + sentence, + return_attention_mask=False, return_tensors=self.framework, - max_length=self.tokenizer.max_len + max_length=self.tokenizer.max_len, ) # Forward - if self.framework == 'tf': + if self.framework == "tf": entities = self.model(tokens)[0][0].numpy() - input_ids = tokens['input_ids'].numpy()[0] + input_ids = tokens["input_ids"].numpy()[0] else: with torch.no_grad(): entities = self.model(**tokens)[0][0].cpu().numpy() - input_ids = tokens['input_ids'].cpu().numpy()[0] + input_ids = tokens["input_ids"].cpu().numpy()[0] score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True) labels_idx = score.argmax(axis=-1) @@ -505,11 +550,13 @@ class NerPipeline(Pipeline): answer = [] for idx, label_idx in enumerate(labels_idx): if self.model.config.id2label[label_idx] not in self.ignore_labels: - answer += [{ - 'word': self.tokenizer.decode([int(input_ids[idx])]), - 
'score': score[idx][label_idx].item(), - 'entity': self.model.config.id2label[label_idx] - }] + answer += [ + { + "word": self.tokenizer.decode([int(input_ids[idx])]), + "score": score[idx][label_idx].item(), + "entity": self.model.config.id2label[label_idx], + } + ] # Append answers += [answer] @@ -526,18 +573,19 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler): QuestionAnsweringArgumentHandler manages all the possible to create SquadExample from the command-line supplied arguments. """ + def __call__(self, *args, **kwargs): # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating if args is not None and len(args) > 0: if len(args) == 1: - kwargs['X'] = args[0] + kwargs["X"] = args[0] else: - kwargs['X'] = list(args) + kwargs["X"] = list(args) # Generic compatibility with sklearn and Keras # Batched data - if 'X' in kwargs or 'data' in kwargs: - inputs = kwargs['X'] if 'X' in kwargs else kwargs['data'] + if "X" in kwargs or "data" in kwargs: + inputs = kwargs["X"] if "X" in kwargs else kwargs["data"] if isinstance(inputs, dict): inputs = [inputs] @@ -547,28 +595,31 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler): for i, item in enumerate(inputs): if isinstance(item, dict): - if any(k not in item for k in ['question', 'context']): - raise KeyError('You need to provide a dictionary with keys {question:..., context:...}') + if any(k not in item for k in ["question", "context"]): + raise KeyError("You need to provide a dictionary with keys {question:..., context:...}") inputs[i] = QuestionAnsweringPipeline.create_sample(**item) elif not isinstance(item, SquadExample): raise ValueError( - '{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)' - .format('X' if 'X' in kwargs else 'data') + "{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)".format( + "X" if "X" in kwargs else "data" + ) ) # Tabular input - elif 'question' in kwargs and 'context' in kwargs: - if isinstance(kwargs['question'], str): - kwargs['question'] = [kwargs['question']] + elif "question" in kwargs and "context" in kwargs: + if isinstance(kwargs["question"], str): + kwargs["question"] = [kwargs["question"]] - if isinstance(kwargs['context'], str): - kwargs['context'] = [kwargs['context']] + if isinstance(kwargs["context"], str): + kwargs["context"] = [kwargs["context"]] - inputs = [QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs['question'], kwargs['context'])] + inputs = [ + QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs["question"], kwargs["context"]) + ] else: - raise ValueError('Unknown arguments {}'.format(kwargs)) + raise ValueError("Unknown arguments {}".format(kwargs)) if not isinstance(inputs, list): inputs = [inputs] @@ -581,22 +632,31 @@ class QuestionAnsweringPipeline(Pipeline): Question Answering pipeline using ModelForQuestionAnswering head. 
""" - default_input_names = 'question,context' - - def __init__(self, model, - tokenizer: Optional[PreTrainedTokenizer], - modelcard: Optional[ModelCard], - framework: Optional[str] = None, - device: int = -1, **kwargs): - super().__init__(model=model, - tokenizer=tokenizer, - modelcard=modelcard, - framework=framework, - args_parser=QuestionAnsweringArgumentHandler(), - device=device, **kwargs) + default_input_names = "question,context" + + def __init__( + self, + model, + tokenizer: Optional[PreTrainedTokenizer], + modelcard: Optional[ModelCard], + framework: Optional[str] = None, + device: int = -1, + **kwargs + ): + super().__init__( + model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + args_parser=QuestionAnsweringArgumentHandler(), + device=device, + **kwargs + ) @staticmethod - def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]: + def create_sample( + question: Union[str, List[str]], context: Union[str, List[str]] + ) -> Union[SquadExample, List[SquadExample]]: """ QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally. This helper method encapsulate all the logic for converting question(s) and context(s) to SquadExample(s). @@ -629,26 +689,28 @@ class QuestionAnsweringPipeline(Pipeline): end: the character index in the original string corresponding to the ending of the answer' span """ # Set defaults values - kwargs.setdefault('topk', 1) - kwargs.setdefault('doc_stride', 128) - kwargs.setdefault('max_answer_len', 15) - kwargs.setdefault('max_seq_len', 384) - kwargs.setdefault('max_question_len', 64) + kwargs.setdefault("topk", 1) + kwargs.setdefault("doc_stride", 128) + kwargs.setdefault("max_answer_len", 15) + kwargs.setdefault("max_seq_len", 384) + kwargs.setdefault("max_question_len", 64) - if kwargs['topk'] < 1: - raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk'])) + if kwargs["topk"] < 1: + raise ValueError("topk parameter should be >= 1 (got {})".format(kwargs["topk"])) - if kwargs['max_answer_len'] < 1: - raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len'])) + if kwargs["max_answer_len"] < 1: + raise ValueError("max_answer_len parameter should be >= 1 (got {})".format(kwargs["max_answer_len"])) # Convert inputs to features examples = self._args_parser(*texts, **kwargs) - features = squad_convert_examples_to_features(examples, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False) + features = squad_convert_examples_to_features( + examples, self.tokenizer, kwargs["max_seq_len"], kwargs["doc_stride"], kwargs["max_question_len"], False + ) fw_args = self.inputs_for_model([f.__dict__ for f in features]) # Manage tensor allocation on correct device with self.device_placement(): - if self.framework == 'tf': + if self.framework == "tf": fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()} start, end = self.model(fw_args) start, end = start.numpy(), end.numpy() @@ -672,16 +734,18 @@ class QuestionAnsweringPipeline(Pipeline): # Mask CLS start_[0] = end_[0] = 0 - starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len']) + starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"]) char_to_word = np.array(example.char_to_word_offset) # Convert the answer (tokens) back to the original text answers += [ { - 'score': score.item(), - 'start': np.where(char_to_word == 
feature.token_to_orig_map[s])[0][0].item(), - 'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), - 'answer': ' '.join(example.doc_tokens[feature.token_to_orig_map[s]:feature.token_to_orig_map[e] + 1]) + "score": score.item(), + "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), + "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), + "answer": " ".join( + example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] + ), } for s, e, score in zip(starts, ends, scores) ] @@ -767,71 +831,71 @@ class QuestionAnsweringPipeline(Pipeline): chars_idx += len(word) + 1 # Join text with spaces - return {'answer': ' '.join(words), 'start': max(0, char_start_idx), 'end': min(len(text), char_end_idx)} + return {"answer": " ".join(words), "start": max(0, char_start_idx), "end": min(len(text), char_end_idx)} # Register all the supported task here SUPPORTED_TASKS = { - 'feature-extraction': { - 'impl': FeatureExtractionPipeline, - 'tf': TFAutoModel if is_tf_available() else None, - 'pt': AutoModel if is_torch_available() else None, - 'default': { - 'model': { - 'pt': 'distilbert-base-uncased', - 'tf': 'distilbert-base-uncased', - }, - 'config': None, - 'tokenizer': 'distilbert-base-uncased' - } + "feature-extraction": { + "impl": FeatureExtractionPipeline, + "tf": TFAutoModel if is_tf_available() else None, + "pt": AutoModel if is_torch_available() else None, + "default": { + "model": {"pt": "distilbert-base-uncased", "tf": "distilbert-base-uncased",}, + "config": None, + "tokenizer": "distilbert-base-uncased", + }, }, - 'sentiment-analysis': { - 'impl': TextClassificationPipeline, - 'tf': TFAutoModelForSequenceClassification if is_tf_available() else None, - 'pt': AutoModelForSequenceClassification if is_torch_available() else None, - 'default': { - 'model': { - 'pt': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin', - 'tf': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5', + "sentiment-analysis": { + "impl": TextClassificationPipeline, + "tf": TFAutoModelForSequenceClassification if is_tf_available() else None, + "pt": AutoModelForSequenceClassification if is_torch_available() else None, + "default": { + "model": { + "pt": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin", + "tf": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5", }, - 'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json', - 'tokenizer': 'distilbert-base-uncased' - } + "config": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json", + "tokenizer": "distilbert-base-uncased", + }, }, - 'ner': { - 'impl': NerPipeline, - 'tf': TFAutoModelForTokenClassification if is_tf_available() else None, - 'pt': AutoModelForTokenClassification if is_torch_available() else None, - 'default': { - 'model': { - 'pt':'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin', - 'tf': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5', + "ner": { + "impl": NerPipeline, + "tf": TFAutoModelForTokenClassification if is_tf_available() else None, + "pt": 
AutoModelForTokenClassification if is_torch_available() else None, + "default": { + "model": { + "pt": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin", + "tf": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5", }, - 'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json', - 'tokenizer': 'bert-large-cased' - } + "config": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json", + "tokenizer": "bert-large-cased", + }, }, - 'question-answering': { - 'impl': QuestionAnsweringPipeline, - 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None, - 'pt': AutoModelForQuestionAnswering if is_torch_available() else None, - 'default': { - 'model': { - 'pt': 'distilbert-base-uncased-distilled-squad', - 'tf': 'distilbert-base-uncased-distilled-squad', + "question-answering": { + "impl": QuestionAnsweringPipeline, + "tf": TFAutoModelForQuestionAnswering if is_tf_available() else None, + "pt": AutoModelForQuestionAnswering if is_torch_available() else None, + "default": { + "model": { + "pt": "distilbert-base-uncased-distilled-squad", + "tf": "distilbert-base-uncased-distilled-squad", }, - 'config': None, - 'tokenizer': 'distilbert-base-uncased' - } - } + "config": None, + "tokenizer": "distilbert-base-uncased", + }, + }, } -def pipeline(task: str, model: Optional = None, - config: Optional[Union[str, PretrainedConfig]] = None, - tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, - modelcard: Optional[Union[str, ModelCard]] = None, - **kwargs) -> Pipeline: +def pipeline( + task: str, + model: Optional = None, + config: Optional[Union[str, PretrainedConfig]] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, + modelcard: Optional[Union[str, ModelCard]] = None, + **kwargs +) -> Pipeline: """ Utility factory method to build a pipeline. Pipeline are made of: @@ -852,11 +916,11 @@ def pipeline(task: str, model: Optional = None, framework = get_framework(model) targeted_task = SUPPORTED_TASKS[task] - task, model_class = targeted_task['impl'], targeted_task[framework] + task, model_class = targeted_task["impl"], targeted_task[framework] # Use default model/config/tokenizer for the task if no model is provided if model is None: - models, config, tokenizer = tuple(targeted_task['default'].values()) + models, config, tokenizer = tuple(targeted_task["default"].values()) model = models[framework] # Try to infer tokenizer from model or config name (if provided as str) @@ -867,8 +931,10 @@ def pipeline(task: str, model: Optional = None, tokenizer = config else: # Impossible to guest what is the right tokenizer here - raise Exception("Impossible to guess which tokenizer to use. " - "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer.") + raise Exception( + "Impossible to guess which tokenizer to use. " + "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer." 
+ ) # Try to infer modelcard from model or config name (if provided as str) if modelcard is None: @@ -894,14 +960,18 @@ def pipeline(task: str, model: Optional = None, if isinstance(model, str): # Handle transparent TF/PT model conversion model_kwargs = {} - if framework == 'pt' and model.endswith('.h5'): - model_kwargs['from_tf'] = True - logger.warning('Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. ' - 'Trying to load the model with PyTorch.') - elif framework == 'tf' and model.endswith('.bin'): - model_kwargs['from_pt'] = True - logger.warning('Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. ' - 'Trying to load the model with Tensorflow.') + if framework == "pt" and model.endswith(".h5"): + model_kwargs["from_tf"] = True + logger.warning( + "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. " + "Trying to load the model with PyTorch." + ) + elif framework == "tf" and model.endswith(".bin"): + model_kwargs["from_pt"] = True + logger.warning( + "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. " + "Trying to load the model with Tensorflow." + ) model = model_class.from_pretrained(model, config=config, **model_kwargs) return task(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, **kwargs) diff --git a/transformers/tests/configuration_common_test.py b/transformers/tests/configuration_common_test.py index 376d110d3..d109a655f 100644 --- a/transformers/tests/configuration_common_test.py +++ b/transformers/tests/configuration_common_test.py @@ -32,10 +32,10 @@ class ConfigTester(object): def create_and_test_config_common_properties(self): config = self.config_class(**self.inputs_dict) - self.parent.assertTrue(hasattr(config, 'vocab_size')) - self.parent.assertTrue(hasattr(config, 'hidden_size')) - self.parent.assertTrue(hasattr(config, 'num_attention_heads')) - self.parent.assertTrue(hasattr(config, 'num_hidden_layers')) + self.parent.assertTrue(hasattr(config, "vocab_size")) + self.parent.assertTrue(hasattr(config, "hidden_size")) + self.parent.assertTrue(hasattr(config, "num_attention_heads")) + self.parent.assertTrue(hasattr(config, "num_hidden_layers")) def create_and_test_config_to_json_string(self): config = self.config_class(**self.inputs_dict) @@ -68,5 +68,6 @@ class ConfigTester(object): self.create_and_test_config_to_json_file() self.create_and_test_config_from_and_save_pretrained() + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/transformers/tests/hf_api_test.py b/transformers/tests/hf_api_test.py index b45f5acee..71963df10 100644 --- a/transformers/tests/hf_api_test.py +++ b/transformers/tests/hf_api_test.py @@ -28,20 +28,15 @@ PASS = "__DUMMY_TRANSFORMERS_PASS__" FILES = [ ( "Test-{}.txt".format(int(time.time())), - os.path.join( - os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt" - ) + os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"), ), ( - "yoyo {}.txt".format(int(time.time())), # space is intentional - os.path.join( - os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt" - ) + "yoyo {}.txt".format(int(time.time())), # space is intentional + os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt"), ), ] - class HfApiCommonTest(unittest.TestCase): _api = HfApi(endpoint="https://moon-staging.huggingface.co") @@ -76,11 +71,9 @@ class HfApiEndpointsTest(HfApiCommonTest): def 
test_presign_and_upload(self): for FILE_KEY, FILE_PATH in FILES: - access_url = self._api.presign_and_upload( - token=self._token, filename=FILE_KEY, filepath=FILE_PATH - ) + access_url = self._api.presign_and_upload(token=self._token, filename=FILE_KEY, filepath=FILE_PATH) self.assertIsInstance(access_url, six.string_types) - with open(FILE_PATH, 'r') as f: + with open(FILE_PATH, "r") as f: body = f.read() r = requests.get(access_url) self.assertEqual(r.text, body) @@ -93,7 +86,6 @@ class HfApiEndpointsTest(HfApiCommonTest): self.assertIsInstance(o, S3Obj) - class HfFolderTest(unittest.TestCase): def test_token_workflow(self): """ @@ -102,18 +94,12 @@ class HfFolderTest(unittest.TestCase): """ token = "token-{}".format(int(time.time())) HfFolder.save_token(token) - self.assertEqual( - HfFolder.get_token(), - token - ) + self.assertEqual(HfFolder.get_token(), token) HfFolder.delete_token() HfFolder.delete_token() # ^^ not an error, we test that the # second call does not fail. - self.assertEqual( - HfFolder.get_token(), - None - ) + self.assertEqual(HfFolder.get_token(), None) if __name__ == "__main__": diff --git a/transformers/tests/model_card_test.py b/transformers/tests/model_card_test.py index b293b5726..30fe33a90 100644 --- a/transformers/tests/model_card_test.py +++ b/transformers/tests/model_card_test.py @@ -21,44 +21,39 @@ import unittest from transformers.modelcard import ModelCard from .tokenization_tests_commons import TemporaryDirectory -class ModelCardTester(unittest.TestCase): +class ModelCardTester(unittest.TestCase): def setUp(self): - self.inputs_dict = {'model_details': { - 'Organization': 'testing', - 'Model date': 'today', - 'Model version': 'v2.1, Developed by Test Corp in 2019.', - 'Architecture': 'Convolutional Neural Network.', - }, - 'metrics': 'BLEU and ROUGE-1', - 'evaluation_data':{ - 'Datasets':{ - 'BLEU': 'My-great-dataset-v1', - 'ROUGE-1': 'My-short-dataset-v2.1', - }, - 'Preprocessing': 'See details on https://arxiv.org/pdf/1810.03993.pdf' - }, - 'training_data':{ - 'Dataset': 'English Wikipedia dump dated 2018-12-01', - 'Preprocessing': 'Using SentencePiece vocabulary of size 52k tokens. See details on https://arxiv.org/pdf/1810.03993.pdf' - }, - 'quantitative_analyses': { - 'BLEU': 55.1, - 'ROUGE-1': 76, - }, - } + self.inputs_dict = { + "model_details": { + "Organization": "testing", + "Model date": "today", + "Model version": "v2.1, Developed by Test Corp in 2019.", + "Architecture": "Convolutional Neural Network.", + }, + "metrics": "BLEU and ROUGE-1", + "evaluation_data": { + "Datasets": {"BLEU": "My-great-dataset-v1", "ROUGE-1": "My-short-dataset-v2.1",}, + "Preprocessing": "See details on https://arxiv.org/pdf/1810.03993.pdf", + }, + "training_data": { + "Dataset": "English Wikipedia dump dated 2018-12-01", + "Preprocessing": "Using SentencePiece vocabulary of size 52k tokens. 
See details on https://arxiv.org/pdf/1810.03993.pdf", + }, + "quantitative_analyses": {"BLEU": 55.1, "ROUGE-1": 76,}, + } def test_model_card_common_properties(self): modelcard = ModelCard.from_dict(self.inputs_dict) - self.assertTrue(hasattr(modelcard, 'model_details')) - self.assertTrue(hasattr(modelcard, 'intended_use')) - self.assertTrue(hasattr(modelcard, 'factors')) - self.assertTrue(hasattr(modelcard, 'metrics')) - self.assertTrue(hasattr(modelcard, 'evaluation_data')) - self.assertTrue(hasattr(modelcard, 'training_data')) - self.assertTrue(hasattr(modelcard, 'quantitative_analyses')) - self.assertTrue(hasattr(modelcard, 'ethical_considerations')) - self.assertTrue(hasattr(modelcard, 'caveats_and_recommendations')) + self.assertTrue(hasattr(modelcard, "model_details")) + self.assertTrue(hasattr(modelcard, "intended_use")) + self.assertTrue(hasattr(modelcard, "factors")) + self.assertTrue(hasattr(modelcard, "metrics")) + self.assertTrue(hasattr(modelcard, "evaluation_data")) + self.assertTrue(hasattr(modelcard, "training_data")) + self.assertTrue(hasattr(modelcard, "quantitative_analyses")) + self.assertTrue(hasattr(modelcard, "ethical_considerations")) + self.assertTrue(hasattr(modelcard, "caveats_and_recommendations")) def test_model_card_to_json_string(self): modelcard = ModelCard.from_dict(self.inputs_dict) @@ -70,7 +65,7 @@ class ModelCardTester(unittest.TestCase): model_card_first = ModelCard.from_dict(self.inputs_dict) with TemporaryDirectory() as tmpdirname: - filename = os.path.join(tmpdirname, u"modelcard.json") + filename = os.path.join(tmpdirname, "modelcard.json") model_card_first.to_json_file(filename) model_card_second = ModelCard.from_json_file(filename) @@ -85,5 +80,6 @@ class ModelCardTester(unittest.TestCase): self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict()) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/modeling_albert_test.py b/transformers/tests/modeling_albert_test.py index b726fd927..f798af95b 100644 --- a/transformers/tests/modeling_albert_test.py +++ b/transformers/tests/modeling_albert_test.py @@ -20,14 +20,18 @@ import unittest from transformers import is_torch_available -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device if is_torch_available(): - from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM, - AlbertForSequenceClassification, AlbertForQuestionAnswering, - ) + from transformers import ( + AlbertConfig, + AlbertModel, + AlbertForMaskedLM, + AlbertForSequenceClassification, + AlbertForQuestionAnswering, + ) from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP @@ -37,33 +41,33 @@ class AlbertModelTest(CommonTestCases.CommonModelTester): all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else () class AlbertModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - embedding_size=16, - hidden_size=36, - num_hidden_layers=6, - num_hidden_groups=6, - num_attention_heads=6, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - 
num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + embedding_size=16, + hidden_size=36, + num_hidden_layers=6, + num_hidden_groups=6, + num_attention_heads=6, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -120,16 +124,17 @@ class AlbertModelTest(CommonTestCases.CommonModelTester): max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, - num_hidden_groups=self.num_hidden_groups) + num_hidden_groups=self.num_hidden_groups, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) - def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_albert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = AlbertModel(config=config) model.to(torch_device) model.eval() @@ -142,66 +147,79 @@ class AlbertModelTest(CommonTestCases.CommonModelTester): "pooled_output": pooled_output, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) - - def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_albert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = AlbertForMaskedLM(config=config) model.to(torch_device) model.eval() - loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels) + loss, prediction_scores = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels + ) result = { "loss": loss, "prediction_scores": prediction_scores, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.check_loss_output(result) - def create_and_check_albert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_albert_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = AlbertForQuestionAnswering(config=config) model.to(torch_device) model.eval() - loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - start_positions=sequence_labels, 
end_positions=sequence_labels) + loss, start_logits, end_logits = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) result = { "loss": loss, "start_logits": start_logits, "end_logits": end_logits, } - self.parent.assertListEqual( - list(result["start_logits"].size()), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].size()), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length]) self.check_loss_output(result) - - def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_albert_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = AlbertForSequenceClassification(config) model.to(torch_device) model.eval() - loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + loss, logits = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels + ) result = { "loss": loss, "logits": logits, } - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels]) self.check_loss_output(result) - def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -233,5 +251,6 @@ class AlbertModelTest(CommonTestCases.CommonModelTester): model = AlbertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/modeling_auto_test.py b/transformers/tests/modeling_auto_test.py index 871a262fe..3bdaa8a37 100644 --- a/transformers/tests/modeling_auto_test.py +++ b/transformers/tests/modeling_auto_test.py @@ -25,14 +25,21 @@ from transformers import is_torch_available from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER if is_torch_available(): - from transformers import (AutoConfig, BertConfig, - AutoModel, BertModel, - AutoModelWithLMHead, BertForMaskedLM, - AutoModelForSequenceClassification, BertForSequenceClassification, - AutoModelForQuestionAnswering, BertForQuestionAnswering) + from transformers import ( + AutoConfig, + BertConfig, + AutoModel, + BertModel, + AutoModelWithLMHead, + BertForMaskedLM, + AutoModelForSequenceClassification, + BertForSequenceClassification, + AutoModelForQuestionAnswering, + BertForQuestionAnswering, + ) from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP - from .modeling_common_test import (CommonTestCases, 
ids_tensor) + from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester @@ -75,7 +82,9 @@ class AutoModelTest(unittest.TestCase): self.assertIsInstance(config, BertConfig) model = AutoModelForSequenceClassification.from_pretrained(model_name) - model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True) + model, loading_info = AutoModelForSequenceClassification.from_pretrained( + model_name, output_loading_info=True + ) self.assertIsNotNone(model) self.assertIsInstance(model, BertForSequenceClassification) diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py index a5adff8f6..6711aded6 100644 --- a/transformers/tests/modeling_bert_test.py +++ b/transformers/tests/modeling_bert_test.py @@ -20,51 +20,68 @@ import unittest from transformers import is_torch_available -from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor, floats_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device if is_torch_available(): - from transformers import (BertConfig, BertModel, BertForMaskedLM, - BertForNextSentencePrediction, BertForPreTraining, - BertForQuestionAnswering, BertForSequenceClassification, - BertForTokenClassification, BertForMultipleChoice) + from transformers import ( + BertConfig, + BertModel, + BertForMaskedLM, + BertForNextSentencePrediction, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + BertForMultipleChoice, + ) from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP @require_torch class BertModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction, - BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, - BertForTokenClassification) if is_torch_available() else () + all_model_classes = ( + ( + BertModel, + BertForMaskedLM, + BertForNextSentencePrediction, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + ) + if is_torch_available() + else () + ) class BertModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -119,25 +136,44 @@ class BertModelTest(CommonTestCases.CommonModelTester): 
max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, is_decoder=False, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def prepare_config_and_inputs_for_decoder(self): - config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() config.is_decoder = True encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) - def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = BertModel(config=config) model.to(torch_device) model.eval() @@ -150,16 +186,38 @@ class BertModelTest(CommonTestCases.CommonModelTester): "pooled_output": pooled_output, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) - def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask): + def create_and_check_bert_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): model = BertModel(config) model.to(torch_device) model.eval() - sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask) - sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states) + sequence_output, pooled_output = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + sequence_output, pooled_output = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) result = { @@ -167,122 +225,171 @@ class BertModelTest(CommonTestCases.CommonModelTester): "pooled_output": pooled_output, } 
self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) - def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = BertForMaskedLM(config=config) model.to(torch_device) model.eval() - loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels) + loss, prediction_scores = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels + ) result = { "loss": loss, "prediction_scores": prediction_scores, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.check_loss_output(result) - def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask): + def create_and_check_bert_model_for_masked_lm_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): model = BertForMaskedLM(config=config) model.to(torch_device) model.eval() - loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask) - loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states) + loss, prediction_scores = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + masked_lm_labels=token_labels, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + loss, prediction_scores = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + masked_lm_labels=token_labels, + encoder_hidden_states=encoder_hidden_states, + ) result = { "loss": loss, "prediction_scores": prediction_scores, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.check_loss_output(result) - def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_next_sequence_prediction( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = BertForNextSentencePrediction(config=config) model.to(torch_device) model.eval() - loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels) + loss, 
seq_relationship_score = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + next_sentence_label=sequence_labels, + ) result = { "loss": loss, "seq_relationship_score": seq_relationship_score, } - self.parent.assertListEqual( - list(result["seq_relationship_score"].size()), - [self.batch_size, 2]) + self.parent.assertListEqual(list(result["seq_relationship_score"].size()), [self.batch_size, 2]) self.check_loss_output(result) - def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = BertForPreTraining(config=config) model.to(torch_device) model.eval() - loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - masked_lm_labels=token_labels, next_sentence_label=sequence_labels) + loss, prediction_scores, seq_relationship_score = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + masked_lm_labels=token_labels, + next_sentence_label=sequence_labels, + ) result = { "loss": loss, "prediction_scores": prediction_scores, "seq_relationship_score": seq_relationship_score, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) - self.parent.assertListEqual( - list(result["seq_relationship_score"].size()), - [self.batch_size, 2]) + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) + self.parent.assertListEqual(list(result["seq_relationship_score"].size()), [self.batch_size, 2]) self.check_loss_output(result) - def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = BertForQuestionAnswering(config=config) model.to(torch_device) model.eval() - loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - start_positions=sequence_labels, end_positions=sequence_labels) + loss, start_logits, end_logits = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) result = { "loss": loss, "start_logits": start_logits, "end_logits": end_logits, } - self.parent.assertListEqual( - list(result["start_logits"].size()), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].size()), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length]) self.check_loss_output(result) - def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = BertForSequenceClassification(config) model.to(torch_device) model.eval() - loss, 
logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + loss, logits = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels + ) result = { "loss": loss, "logits": logits, } - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels]) self.check_loss_output(result) - def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = BertForTokenClassification(config=config) model.to(torch_device) model.eval() - loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + loss, logits = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels + ) result = { "loss": loss, "logits": logits, } self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.seq_length, self.num_labels]) + list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels] + ) self.check_loss_output(result) - def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_choices = self.num_choices model = BertForMultipleChoice(config=config) model.to(torch_device) @@ -290,24 +397,31 @@ class BertModelTest(CommonTestCases.CommonModelTester): multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - loss, logits = model(multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels) + loss, logits = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) result = { "loss": loss, "logits": logits, } - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.num_choices]) + self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices]) self.check_loss_output(result) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): diff --git a/transformers/tests/modeling_common_test.py 
b/transformers/tests/modeling_common_test.py index 2116651f4..6834c78d1 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -36,34 +36,48 @@ if is_torch_available(): import torch import numpy as np - from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel, - BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers import ( + AdaptiveEmbedding, + PretrainedConfig, + PreTrainedModel, + BertModel, + BertConfig, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, + GPT2Config, + GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + ) if sys.version_info[0] == 2: import cPickle as pickle class TemporaryDirectory(object): """Context manager for tempfile.mkdtemp() so it's usable with "with" statement.""" + def __enter__(self): self.name = tempfile.mkdtemp() return self.name + def __exit__(self, exc_type, exc_value, traceback): shutil.rmtree(self.name) + + else: import pickle + TemporaryDirectory = tempfile.TemporaryDirectory unicode = str + def _config_zero_init(config): configs_no_init = copy.deepcopy(config) for key in configs_no_init.__dict__.keys(): - if '_range' in key or '_std' in key or 'initializer_factor' in key: + if "_range" in key or "_std" in key or "initializer_factor" in key: setattr(configs_no_init, key, 0.0) return configs_no_init -class CommonTestCases: +class CommonTestCases: @require_torch class CommonModelTester(unittest.TestCase): @@ -108,8 +122,11 @@ class CommonTestCases: model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - self.assertIn(param.data.mean().item(), [0.0, 1.0], - msg="Parameter {} of model {} seems not properly initialized".format(name, model_class)) + self.assertIn( + param.data.mean().item(), + [0.0, 1.0], + msg="Parameter {} of model {} seems not properly initialized".format(name, model_class), + ) def test_determinism(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -131,10 +148,22 @@ class CommonTestCases: def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length - encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length - decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length - encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length + decoder_seq_length = ( + self.model_tester.decoder_seq_length + if hasattr(self.model_tester, "decoder_seq_length") + else self.model_tester.seq_length + ) + encoder_seq_length = ( + self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "encoder_seq_length") + else self.model_tester.seq_length + ) + decoder_key_length = ( + self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length + ) + encoder_key_length = ( + self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length + ) for model_class in self.all_model_classes: config.output_attentions = True @@ -150,23 +179,20 @@ class CommonTestCases: self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) 
self.assertListEqual( list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - encoder_seq_length , - encoder_key_length]) + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) out_len = len(outputs) if self.is_encoder_decoder: self.assertEqual(out_len % 2, 0) - decoder_attentions = outputs[(out_len // 2)-1] + decoder_attentions = outputs[(out_len // 2) - 1] self.assertEqual(model.config.output_attentions, True) self.assertEqual(model.config.output_hidden_states, False) self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - decoder_seq_length, - decoder_key_length - ]) + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) # Check attention is always last and order is fine config.output_attentions = True @@ -184,9 +210,8 @@ class CommonTestCases: self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - encoder_seq_length, - encoder_key_length]) + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) def test_torchscript(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -215,7 +240,7 @@ class CommonTestCases: model = model_class(config=configs_no_init) model.to(torch_device) model.eval() - inputs = inputs_dict['input_ids'] # Let's keep only input_ids + inputs = inputs_dict["input_ids"] # Let's keep only input_ids try: traced_gpt2 = torch.jit.trace(model, inputs) @@ -269,12 +294,14 @@ class CommonTestCases: # Prepare head_mask # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) - head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device) + head_mask = torch.ones( + self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device + ) head_mask[0, 0] = 0 head_mask[-1, :-1] = 0 head_mask.requires_grad_(requires_grad=True) inputs = inputs_dict.copy() - inputs['head_mask'] = head_mask + inputs["head_mask"] = head_mask outputs = model(**inputs) @@ -289,21 +316,20 @@ class CommonTestCases: # Remove Nan for t in attentions: - self.assertLess(torch.sum(torch.isnan(t)), t.numel() / 4) # Check we don't have more than 25% nans (arbitrary) - attentions = [t.masked_fill(torch.isnan(t), 0.0) for t in attentions] # remove them (the test is less complete) + self.assertLess( + torch.sum(torch.isnan(t)), t.numel() / 4 + ) # Check we don't have more than 25% nans (arbitrary) + attentions = [ + t.masked_fill(torch.isnan(t), 0.0) for t in attentions + ] # remove them (the test is less complete) self.assertIsNotNone(multihead_outputs) self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) - self.assertAlmostEqual( - attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) - self.assertNotEqual( - attentions[0][..., -1, :, :].flatten().sum().item(), 0.0) - self.assertNotEqual( - attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) - self.assertAlmostEqual( - attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0) - self.assertNotEqual( - attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) + self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[0][..., -1, :, 
:].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) def test_head_pruning(self): if not self.test_pruning: @@ -320,20 +346,16 @@ class CommonTestCases: model = model_class(config=config) model.to(torch_device) model.eval() - heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), - -1: [0]} + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]} model.prune_heads(heads_to_prune) with torch.no_grad(): outputs = model(**inputs_dict) attentions = outputs[-1] - self.assertEqual( - attentions[0].shape[-3], 1) - self.assertEqual( - attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual( - attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) def test_head_pruning_save_load_from_pretrained(self): if not self.test_pruning: @@ -350,8 +372,7 @@ class CommonTestCases: model = model_class(config=config) model.to(torch_device) model.eval() - heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), - -1: [0]} + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]} model.prune_heads(heads_to_prune) with TemporaryDirectory() as temp_dir_name: @@ -366,7 +387,6 @@ class CommonTestCases: self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) - def test_head_pruning_save_load_from_config_init(self): if not self.test_pruning: return @@ -380,8 +400,7 @@ class CommonTestCases: config.output_attentions = True config.output_hidden_states = False - heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), - -1: [0]} + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]} config.pruned_heads = heads_to_prune model = model_class(config=config) @@ -446,7 +465,7 @@ class CommonTestCases: outputs = model(**inputs_dict) attentions = outputs[-1] - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1) + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2) self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) @@ -470,8 +489,13 @@ class CommonTestCases: self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) self.assertListEqual( list(hidden_states[0].shape[-2:]), - [self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length, - self.model_tester.hidden_size]) + [ + self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "encoder_seq_length") + else self.model_tester.seq_length, + self.model_tester.hidden_size, + ], + ) def test_resize_tokens_embeddings(self): original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -512,15 +536,10 @@ class CommonTestCases: for model_class in self.all_model_classes: model = 
model_class(config) - self.assertIsInstance( - model.get_input_embeddings(), - (torch.nn.Embedding, AdaptiveEmbedding) - ) + self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding)) model.set_input_embeddings(torch.nn.Embedding(10, 10)) x = model.get_output_embeddings() - self.assertTrue( - x is None or isinstance(x, torch.nn.Linear) - ) + self.assertTrue(x is None or isinstance(x, torch.nn.Linear)) def test_tie_model_weights(self): if not self.test_torchscript: @@ -602,30 +621,30 @@ class CommonTestCases: outputs = model(**inputs_dict) class GPTModelTester(CommonModelTester): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_position_ids=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - n_positions=33, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - n_choices=3, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - scope=None, - config_class=None, - base_model_class=None, - lm_head_model_class=None, - double_head_model_class=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_position_ids=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + n_positions=33, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + n_choices=3, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + scope=None, + config_class=None, + base_model_class=None, + lm_head_model_class=None, + double_head_model_class=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -676,13 +695,14 @@ class CommonTestCases: n_embd=self.hidden_size, n_layer=self.num_hidden_layers, n_head=self.num_attention_heads, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) - return (config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids) + return (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids) - def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids): + def create_and_check_base_model( + self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids + ): model = self.base_model_class(config) model.to(torch_device) model.eval() @@ -694,12 +714,12 @@ class CommonTestCases: hidden_state = outputs[0] self.parent.assertListEqual( - list(hidden_state.size()), - [self.batch_size, self.n_choices, self.seq_length, self.hidden_size]) + list(hidden_state.size()), [self.batch_size, self.n_choices, self.seq_length, self.hidden_size] + ) - - def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids): + def create_and_check_lm_head( + self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids + ): model = self.lm_head_model_class(config) model.to(torch_device) model.eval() @@ -709,14 +729,13 @@ class CommonTestCases: total_voc = self.vocab_size self.parent.assertListEqual( - list(lm_logits.size()), - [self.batch_size, self.n_choices, self.seq_length, total_voc]) - self.parent.assertListEqual( - list(loss.size()), - []) + list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc] + ) + self.parent.assertListEqual(list(loss.size()), []) - def create_and_check_presents(self, config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids): + 
def create_and_check_presents( + self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids + ): for model_class in self.all_model_classes: model = model_class(config) model.to(torch_device) @@ -727,30 +746,39 @@ class CommonTestCases: self.parent.assertEqual(self.num_hidden_layers, len(presents)) self.parent.assertListEqual( list(presents[0].size()), - [2, self.batch_size * self.n_choices, self.num_attention_heads, - self.seq_length, self.hidden_size // self.num_attention_heads]) + [ + 2, + self.batch_size * self.n_choices, + self.num_attention_heads, + self.seq_length, + self.hidden_size // self.num_attention_heads, + ], + ) - def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids): + def create_and_check_double_heads( + self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids + ): model = self.double_head_model_class(config) model.to(torch_device) model.eval() with torch.no_grad(): - outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels, - token_type_ids=token_type_ids, position_ids=position_ids) + outputs = model( + input_ids, + mc_token_ids, + lm_labels=lm_labels, + mc_labels=mc_labels, + token_type_ids=token_type_ids, + position_ids=position_ids, + ) lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4] loss = [lm_loss, mc_loss] total_voc = self.vocab_size self.parent.assertListEqual( - list(lm_logits.size()), - [self.batch_size, self.n_choices, self.seq_length, total_voc]) - self.parent.assertListEqual( - list(mc_logits.size()), - [self.batch_size, self.n_choices]) - self.parent.assertListEqual( - [list(l.size()) for l in loss], - [[], []]) + list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc] + ) + self.parent.assertListEqual(list(mc_logits.size()), [self.batch_size, self.n_choices]) + self.parent.assertListEqual([list(l.size()) for l in loss], [[], []]) def create_and_check_model_from_pretrained(self): for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]: @@ -759,9 +787,8 @@ class CommonTestCases: def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids) = config_and_inputs - inputs_dict = {'input_ids': input_ids} + (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids) = config_and_inputs + inputs_dict = {"input_ids": input_ids} return config, inputs_dict def run_common_tests(self, test_presents=False): @@ -791,10 +818,10 @@ class ConfigTester(object): def create_and_test_config_common_properties(self): config = self.config_class(**self.inputs_dict) - self.parent.assertTrue(hasattr(config, 'vocab_size')) - self.parent.assertTrue(hasattr(config, 'hidden_size')) - self.parent.assertTrue(hasattr(config, 'num_attention_heads')) - self.parent.assertTrue(hasattr(config, 'num_hidden_layers')) + self.parent.assertTrue(hasattr(config, "vocab_size")) + self.parent.assertTrue(hasattr(config, "hidden_size")) + self.parent.assertTrue(hasattr(config, "num_attention_heads")) + self.parent.assertTrue(hasattr(config, "num_hidden_layers")) def create_and_test_config_to_json_string(self): config = self.config_class(**self.inputs_dict) diff --git a/transformers/tests/modeling_ctrl_test.py b/transformers/tests/modeling_ctrl_test.py index ed0d62d1e..9b71b1dd5 100644 --- a/transformers/tests/modeling_ctrl_test.py +++ 
b/transformers/tests/modeling_ctrl_test.py @@ -21,10 +21,9 @@ import pdb from transformers import is_torch_available if is_torch_available(): - from transformers import (CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, - CTRLLMHeadModel) + from transformers import CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -39,32 +38,32 @@ class CTRLModelTest(CommonTestCases.CommonModelTester): test_head_masking = False class CTRLModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -129,12 +128,20 @@ class CTRLModelTest(CommonTestCases.CommonModelTester): head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = CTRLModel(config=config) @@ -150,8 +157,8 @@ class CTRLModelTest(CommonTestCases.CommonModelTester): "presents": presents, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertEqual(len(result["presents"]), config.n_layer) def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): @@ -161,29 +168,28 @@ class CTRLModelTest(CommonTestCases.CommonModelTester): loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - result = { - "loss": loss, - "lm_logits": lm_logits - } + result = {"loss": loss, "lm_logits": lm_logits} + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - 
list(result["lm_logits"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) - + list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, head_mask, token_type_ids, - mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs - - inputs_dict = { - 'input_ids': input_ids, - 'token_type_ids': token_type_ids, - 'head_mask': head_mask - } + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask} return config, inputs_dict diff --git a/transformers/tests/modeling_distilbert_test.py b/transformers/tests/modeling_distilbert_test.py index ac6f5d248..5b4f4683d 100644 --- a/transformers/tests/modeling_distilbert_test.py +++ b/transformers/tests/modeling_distilbert_test.py @@ -21,11 +21,16 @@ import unittest from transformers import is_torch_available if is_torch_available(): - from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM, - DistilBertForTokenClassification, - DistilBertForQuestionAnswering, DistilBertForSequenceClassification) - -from .modeling_common_test import (CommonTestCases, ids_tensor) + from transformers import ( + DistilBertConfig, + DistilBertModel, + DistilBertForMaskedLM, + DistilBertForTokenClassification, + DistilBertForQuestionAnswering, + DistilBertForSequenceClassification, + ) + +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -33,39 +38,42 @@ from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch class DistilBertModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, - DistilBertForSequenceClassification) if is_torch_available() else None + all_model_classes = ( + (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification) + if is_torch_available() + else None + ) test_pruning = True test_torchscript = True test_resize_embeddings = True test_head_masking = True class DistilBertModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size 
self.seq_length = seq_length @@ -114,16 +122,17 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): dropout=self.hidden_dropout_prob, attention_dropout=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) - def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_model( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = DistilBertModel(config=config) model.to(torch_device) model.eval() @@ -134,10 +143,12 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): "sequence_output": sequence_output, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) - def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_masked_lm( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = DistilBertForMaskedLM(config=config) model.to(torch_device) model.eval() @@ -147,29 +158,31 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): "prediction_scores": prediction_scores, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.check_loss_output(result) - def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_question_answering( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = DistilBertForQuestionAnswering(config=config) model.to(torch_device) model.eval() - loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels) + loss, start_logits, end_logits = model( + input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels + ) result = { "loss": loss, "start_logits": start_logits, "end_logits": end_logits, } - self.parent.assertListEqual( - list(result["start_logits"].size()), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].size()), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length]) self.check_loss_output(result) - def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_sequence_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = 
DistilBertForSequenceClassification(config) model.to(torch_device) @@ -179,12 +192,12 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): "loss": loss, "logits": logits, } - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels]) self.check_loss_output(result) - def create_and_check_distilbert_for_token_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_token_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = DistilBertForTokenClassification(config=config) model.to(torch_device) @@ -196,14 +209,14 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): "logits": logits, } self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.seq_length, self.num_labels]) + list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels] + ) self.check_loss_output(result) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'attention_mask': input_mask} + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -239,5 +252,6 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): # model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) # self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/modeling_encoder_decoder_test.py b/transformers/tests/modeling_encoder_decoder_test.py index 64e86df8f..491c502ba 100644 --- a/transformers/tests/modeling_encoder_decoder_test.py +++ b/transformers/tests/modeling_encoder_decoder_test.py @@ -39,13 +39,13 @@ class EncoderDecoderModelTest(unittest.TestCase): def test_model2model_from_pretrained_not_bert(self): logging.basicConfig(level=logging.INFO) with self.assertRaises(ValueError): - _ = Model2Model.from_pretrained('roberta') + _ = Model2Model.from_pretrained("roberta") with self.assertRaises(ValueError): - _ = Model2Model.from_pretrained('distilbert') + _ = Model2Model.from_pretrained("distilbert") with self.assertRaises(ValueError): - _ = Model2Model.from_pretrained('does-not-exist') + _ = Model2Model.from_pretrained("does-not-exist") if __name__ == "__main__": diff --git a/transformers/tests/modeling_gpt2_test.py b/transformers/tests/modeling_gpt2_test.py index ad2ec1fd9..2706166b3 100644 --- a/transformers/tests/modeling_gpt2_test.py +++ b/transformers/tests/modeling_gpt2_test.py @@ -21,10 +21,15 @@ import unittest from transformers import is_torch_available if is_torch_available(): - from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, - GPT2LMHeadModel, GPT2DoubleHeadsModel) - -from .modeling_common_test import (CommonTestCases, ids_tensor) + from transformers import ( + GPT2Config, + GPT2Model, + GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, + GPT2DoubleHeadsModel, + ) + +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -35,32 +40,32 @@ class 
GPT2ModelTest(CommonTestCases.CommonModelTester): all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else () class GPT2ModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -125,12 +130,20 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = GPT2Model(config=config) @@ -146,8 +159,8 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): "presents": presents, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertEqual(len(result["presents"]), config.n_layer) def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): @@ -157,63 +170,58 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - result = { - "loss": loss, - "lm_logits": lm_logits - } + result = {"loss": loss, "lm_logits": lm_logits} + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["lm_logits"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) - def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args): + def create_and_check_double_lm_head_model( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args + ): model = GPT2DoubleHeadsModel(config) model.to(torch_device) model.eval() - multiple_choice_inputs_ids = 
input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - inputs = {'input_ids': multiple_choice_inputs_ids, - 'mc_token_ids': mc_token_ids, - 'attention_mask': multiple_choice_input_mask, - 'token_type_ids': multiple_choice_token_type_ids, - 'lm_labels': multiple_choice_inputs_ids} + inputs = { + "input_ids": multiple_choice_inputs_ids, + "mc_token_ids": mc_token_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + "lm_labels": multiple_choice_inputs_ids, + } loss, lm_logits, mc_logits, _ = model(**inputs) - result = { - "loss": loss, - "lm_logits": lm_logits, - "mc_logits": mc_logits - } + result = {"loss": loss, "lm_logits": lm_logits, "mc_logits": mc_logits} + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["lm_logits"].size()), - [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]) - self.parent.assertListEqual( - list(result["mc_logits"].size()), - [self.batch_size, self.num_choices]) + list(result["lm_logits"].size()), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size] + ) + self.parent.assertListEqual(list(result["mc_logits"].size()), [self.batch_size, self.num_choices]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, head_mask, token_type_ids, - mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs - - inputs_dict = { - 'input_ids': input_ids, - 'token_type_ids': token_type_ids, - 'head_mask': head_mask - } + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask} return config, inputs_dict diff --git a/transformers/tests/modeling_openai_test.py b/transformers/tests/modeling_openai_test.py index 1880febca..f22a0b760 100644 --- a/transformers/tests/modeling_openai_test.py +++ b/transformers/tests/modeling_openai_test.py @@ -21,10 +21,15 @@ import unittest from transformers import is_torch_available if is_torch_available(): - from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, - OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) - -from .modeling_common_test import (CommonTestCases, ids_tensor) + from transformers import ( + OpenAIGPTConfig, + OpenAIGPTModel, + OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + OpenAIGPTLMHeadModel, + OpenAIGPTDoubleHeadsModel, + ) + +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -32,33 +37,35 @@ from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else () + all_model_classes = ( + (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else () + ) class OpenAIGPTModelTester(object): - - def __init__(self, - 
parent, - batch_size=13, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -116,9 +123,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args): model = OpenAIGPTModel(config=config) @@ -129,12 +134,10 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): model(input_ids, token_type_ids=token_type_ids) (sequence_output,) = model(input_ids) - result = { - "sequence_output": sequence_output - } + result = {"sequence_output": sequence_output} self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): model = OpenAIGPTLMHeadModel(config) @@ -143,17 +146,12 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - result = { - "loss": loss, - "lm_logits": lm_logits - } + result = {"loss": loss, "lm_logits": lm_logits} + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["lm_logits"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): model = OpenAIGPTDoubleHeadsModel(config) @@ -162,26 +160,25 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids) - result = { - "loss": loss, - "lm_logits": lm_logits - } + result = {"loss": loss, "lm_logits": lm_logits} + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["lm_logits"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) def 
prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = { - 'input_ids': input_ids, - 'token_type_ids': token_type_ids, - 'head_mask': head_mask - } + ( + config, + input_ids, + head_mask, + token_type_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask} return config, inputs_dict diff --git a/transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py index 732e589cd..451dafe08 100644 --- a/transformers/tests/modeling_roberta_test.py +++ b/transformers/tests/modeling_roberta_test.py @@ -22,12 +22,17 @@ from transformers import is_torch_available if is_torch_available(): import torch - from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, - RobertaForSequenceClassification, RobertaForTokenClassification) + from transformers import ( + RobertaConfig, + RobertaModel, + RobertaForMaskedLM, + RobertaForSequenceClassification, + RobertaForTokenClassification, + ) from transformers.modeling_roberta import RobertaEmbeddings from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -38,31 +43,31 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else () class RobertaModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -116,17 +121,17 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): attention_probs_dropout_prob=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) - def create_and_check_roberta_model(self, config, 
input_ids, token_type_ids, input_mask, sequence_labels, - token_labels, choice_labels): + def create_and_check_roberta_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = RobertaModel(config=config) model.to(torch_device) model.eval() @@ -139,47 +144,59 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): "pooled_output": pooled_output, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) - def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, - token_labels, choice_labels): + def create_and_check_roberta_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = RobertaForMaskedLM(config=config) model.to(torch_device) model.eval() - loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels) + loss, prediction_scores = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels + ) result = { "loss": loss, "prediction_scores": prediction_scores, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.check_loss_output(result) - def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels): + def create_and_check_roberta_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = RobertaForTokenClassification(config=config) model.to(torch_device) model.eval() - loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - labels=token_labels) + loss, logits = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels + ) result = { "loss": loss, "logits": logits, } self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.seq_length, self.num_labels]) + list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels] + ) self.check_loss_output(result) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -214,18 +231,12 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): model = RobertaEmbeddings(config=config) input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = torch.as_tensor([[ - 0 + model.padding_idx + 1, - 1 + 
model.padding_idx + 1, - 2 + model.padding_idx + 1, - model.padding_idx - ]]) + expected_positions = torch.as_tensor( + [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] + ) position_ids = model.create_position_ids_from_input_ids(input_ids) - self.assertEqual( - position_ids.shape, - expected_positions.shape - ) + self.assertEqual(position_ids.shape, expected_positions.shape) self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) def test_create_position_ids_from_inputs_embeds(self): @@ -247,69 +258,47 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): ] expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual( - position_ids.shape, - expected_positions.shape - ) - self.assertTrue( - torch.all(torch.eq(position_ids, expected_positions)) - ) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) class RobertaModelIntegrationTest(unittest.TestCase): - @slow def test_inference_masked_lm(self): - model = RobertaForMaskedLM.from_pretrained('roberta-base') + model = RobertaForMaskedLM.from_pretrained("roberta-base") - input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) output = model(input_ids)[0] expected_shape = torch.Size((1, 11, 50265)) - self.assertEqual( - output.shape, - expected_shape - ) + self.assertEqual(output.shape, expected_shape) # compare the actual values for a slice. expected_slice = torch.Tensor( - [[[33.8843, -4.3107, 22.7779], - [ 4.6533, -2.8099, 13.6252], - [ 1.8222, -3.6898, 8.8600]]] - ) - self.assertTrue( - torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3) + [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]] ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)) @slow def test_inference_no_head(self): - model = RobertaModel.from_pretrained('roberta-base') + model = RobertaModel.from_pretrained("roberta-base") - input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) output = model(input_ids)[0] # compare the actual values for a slice. 
expected_slice = torch.Tensor( - [[[-0.0231, 0.0782, 0.0074], - [-0.1854, 0.0539, -0.0174], - [ 0.0548, 0.0799, 0.1687]]] - ) - self.assertTrue( - torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3) + [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0539, -0.0174], [0.0548, 0.0799, 0.1687]]] ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)) @slow def test_inference_classification_head(self): - model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli') + model = RobertaForSequenceClassification.from_pretrained("roberta-large-mnli") - input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) output = model(input_ids)[0] expected_shape = torch.Size((1, 3)) - self.assertEqual( - output.shape, - expected_shape - ) - expected_tensor = torch.Tensor([[-0.9469, 0.3913, 0.5118]]) - self.assertTrue( - torch.allclose(output, expected_tensor, atol=1e-3) - ) + self.assertEqual(output.shape, expected_shape) + expected_tensor = torch.Tensor([[-0.9469, 0.3913, 0.5118]]) + self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-3)) if __name__ == "__main__": diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py index 9fd9a4b30..3feb61a62 100644 --- a/transformers/tests/modeling_t5_test.py +++ b/transformers/tests/modeling_t5_test.py @@ -20,12 +20,12 @@ import unittest from transformers import is_torch_available -from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor, floats_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device if is_torch_available(): - from transformers import (T5Config, T5Model, T5WithLMHeadModel) + from transformers import T5Config, T5Model, T5WithLMHeadModel from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP @@ -39,26 +39,26 @@ class T5ModelTest(CommonTestCases.CommonModelTester): is_encoder_decoder = True class T5ModelTester(object): - - def __init__(self, - parent, - batch_size=13, - encoder_seq_length=7, - decoder_seq_length=9, - is_training=True, - use_attention_mask=True, - use_labels=True, - vocab_size=99, - n_positions=14, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=8, - dropout_rate=0.1, - initializer_factor=0.002, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + encoder_seq_length=7, + decoder_seq_length=9, + is_training=True, + use_attention_mask=True, + use_labels=True, + vocab_size=99, + n_positions=14, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + d_ff=37, + relative_attention_num_buckets=8, + dropout_rate=0.1, + initializer_factor=0.002, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length @@ -101,60 +101,96 @@ class T5ModelTest(CommonTestCases.CommonModelTester): num_heads=self.num_attention_heads, relative_attention_num_buckets=self.relative_attention_num_buckets, dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor) - - return (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels) + initializer_factor=self.initializer_factor, + ) + + return ( + config, + encoder_input_ids, + decoder_input_ids, + encoder_attention_mask, + 
decoder_attention_mask, + decoder_lm_labels, + ) def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) - - def create_and_check_t5_model(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels): + self.parent.assertListEqual(list(result["loss"].size()), []) + + def create_and_check_t5_model( + self, + config, + encoder_input_ids, + decoder_input_ids, + encoder_attention_mask, + decoder_attention_mask, + decoder_lm_labels, + ): model = T5Model(config=config) model.eval() - decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids, - decoder_input_ids=decoder_input_ids, - encoder_attention_mask=encoder_attention_mask, - decoder_attention_mask=decoder_attention_mask) - decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids, - decoder_input_ids=decoder_input_ids) + decoder_output, encoder_output = model( + encoder_input_ids=encoder_input_ids, + decoder_input_ids=decoder_input_ids, + encoder_attention_mask=encoder_attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + decoder_output, encoder_output = model( + encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids + ) result = { "encoder_output": encoder_output, "decoder_output": decoder_output, } self.parent.assertListEqual( - list(result["encoder_output"].size()), - [self.batch_size, self.encoder_seq_length, self.hidden_size]) + list(result["encoder_output"].size()), [self.batch_size, self.encoder_seq_length, self.hidden_size] + ) self.parent.assertListEqual( - list(result["decoder_output"].size()), - [self.batch_size, self.decoder_seq_length, self.hidden_size]) - - - def create_and_check_t5_with_lm_head(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels): + list(result["decoder_output"].size()), [self.batch_size, self.decoder_seq_length, self.hidden_size] + ) + + def create_and_check_t5_with_lm_head( + self, + config, + encoder_input_ids, + decoder_input_ids, + encoder_attention_mask, + decoder_attention_mask, + decoder_lm_labels, + ): model = T5WithLMHeadModel(config=config) model.eval() - outputs = model(encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, decoder_lm_labels=decoder_lm_labels) + outputs = model( + encoder_input_ids=encoder_input_ids, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_lm_labels=decoder_lm_labels, + ) loss, prediction_scores = outputs[0], outputs[1] result = { "loss": loss, "prediction_scores": prediction_scores, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.decoder_seq_length, self.vocab_size]) + list(result["prediction_scores"].size()), [self.batch_size, self.decoder_seq_length, self.vocab_size] + ) self.check_loss_output(result) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, - decoder_attention_mask, decoder_lm_labels) = config_and_inputs - inputs_dict = {'encoder_input_ids': encoder_input_ids, - 'decoder_input_ids': decoder_input_ids, - 'decoder_attention_mask': decoder_attention_mask, - 'encoder_attention_mask': encoder_attention_mask} + ( + config, + encoder_input_ids, + decoder_input_ids, + encoder_attention_mask, + decoder_attention_mask, + decoder_lm_labels, + ) 
= config_and_inputs + inputs_dict = { + "encoder_input_ids": encoder_input_ids, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "encoder_attention_mask": encoder_attention_mask, + } return config, inputs_dict def setUp(self): @@ -178,5 +214,6 @@ class T5ModelTest(CommonTestCases.CommonModelTester): model = T5Model.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/modeling_tf_albert_test.py b/transformers/tests/modeling_tf_albert_test.py index 374417cfe..0406592d5 100644 --- a/transformers/tests/modeling_tf_albert_test.py +++ b/transformers/tests/modeling_tf_albert_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import sys -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,47 +27,48 @@ from transformers import AlbertConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_albert import (TFAlbertModel, TFAlbertForMaskedLM, - TFAlbertForSequenceClassification, - TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_albert import ( + TFAlbertModel, + TFAlbertForMaskedLM, + TFAlbertForSequenceClassification, + TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) @require_tf class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester): all_model_classes = ( - TFAlbertModel, - TFAlbertForMaskedLM, - TFAlbertForSequenceClassification - ) if is_tf_available() else () + (TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification) if is_tf_available() else () + ) class TFAlbertModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - embedding_size=16, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + embedding_size=16, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -93,27 +94,22 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester): self.scope = scope def prepare_config_and_inputs(self): - input_ids = ids_tensor( - [self.batch_size, self.seq_length], self.vocab_size) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_mask = None if self.use_input_mask: - input_mask = ids_tensor( - [self.batch_size, self.seq_length], vocab_size=2) + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) token_type_ids = None if self.use_token_type_ids: - token_type_ids = 
ids_tensor( - [self.batch_size, self.seq_length], self.type_vocab_size) + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) sequence_labels = None token_labels = None choice_labels = None if self.use_labels: - sequence_labels = ids_tensor( - [self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor( - [self.batch_size, self.seq_length], self.num_labels) + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) choice_labels = ids_tensor([self.batch_size], self.num_choices) config = AlbertConfig( @@ -127,19 +123,20 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester): attention_probs_dropout_prob=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_albert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFAlbertModel(config=config) # inputs = {'input_ids': input_ids, # 'attention_mask': input_mask, # 'token_type_ids': token_type_ids} # sequence_output, pooled_output = model(**inputs) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} sequence_output, pooled_output = model(inputs) inputs = [input_ids, input_mask] @@ -152,50 +149,52 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester): "pooled_output": pooled_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) - self.parent.assertListEqual(list(result["pooled_output"].shape), [ - self.batch_size, self.hidden_size]) + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) + self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) - def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_albert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFAlbertForMaskedLM(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - prediction_scores, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (prediction_scores,) = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_albert_for_sequence_classification( + self, config, 
input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = TFAlbertForSequenceClassification(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - logits, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (logits,) = model(inputs) result = { "logits": logits.numpy(), } - self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, - 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): self.model_tester = TFAlbertModelTest.TFAlbertModelTester(self) - self.config_tester = ConfigTester( - self, config_class=AlbertConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() @@ -206,13 +205,11 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester): def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_albert_for_masked_lm( - *config_and_inputs) + self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs) def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_albert_for_sequence_classification( - *config_and_inputs) + self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs) @slow def test_model_from_pretrained(self): diff --git a/transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py index 2ad39ddcc..d695474ec 100644 --- a/transformers/tests/modeling_tf_auto_test.py +++ b/transformers/tests/modeling_tf_auto_test.py @@ -25,14 +25,21 @@ from transformers import is_tf_available from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER if is_tf_available(): - from transformers import (AutoConfig, BertConfig, - TFAutoModel, TFBertModel, - TFAutoModelWithLMHead, TFBertForMaskedLM, - TFAutoModelForSequenceClassification, TFBertForSequenceClassification, - TFAutoModelForQuestionAnswering, TFBertForQuestionAnswering) + from transformers import ( + AutoConfig, + BertConfig, + TFAutoModel, + TFBertModel, + TFAutoModelWithLMHead, + TFBertForMaskedLM, + TFAutoModelForSequenceClassification, + TFBertForSequenceClassification, + TFAutoModelForQuestionAnswering, + TFBertForQuestionAnswering, + ) from transformers.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP - from .modeling_common_test import (CommonTestCases, ids_tensor) + from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester @@ -41,11 +48,12 @@ class TFAutoModelTest(unittest.TestCase): @slow def test_model_from_pretrained(self): 
import h5py + self.assertTrue(h5py.version.hdf5_version.startswith("1.10")) logging.basicConfig(level=logging.INFO) # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - for model_name in ['bert-base-uncased']: + for model_name in ["bert-base-uncased"]: config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) @@ -58,7 +66,7 @@ class TFAutoModelTest(unittest.TestCase): def test_lmhead_model_from_pretrained(self): logging.basicConfig(level=logging.INFO) # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - for model_name in ['bert-base-uncased']: + for model_name in ["bert-base-uncased"]: config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) @@ -71,7 +79,7 @@ class TFAutoModelTest(unittest.TestCase): def test_sequence_classification_model_from_pretrained(self): logging.basicConfig(level=logging.INFO) # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - for model_name in ['bert-base-uncased']: + for model_name in ["bert-base-uncased"]: config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) @@ -84,7 +92,7 @@ class TFAutoModelTest(unittest.TestCase): def test_question_answering_model_from_pretrained(self): logging.basicConfig(level=logging.INFO) # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - for model_name in ['bert-base-uncased']: + for model_name in ["bert-base-uncased"]: config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) diff --git a/transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py index abf20b151..e36e3a2c3 100644 --- a/transformers/tests/modeling_tf_bert_test.py +++ b/transformers/tests/modeling_tf_bert_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import sys -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,49 +27,62 @@ from transformers import BertConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_bert import (TFBertModel, TFBertForMaskedLM, - TFBertForNextSentencePrediction, - TFBertForPreTraining, - TFBertForSequenceClassification, - TFBertForMultipleChoice, - TFBertForTokenClassification, - TFBertForQuestionAnswering, - TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_bert import ( + TFBertModel, + TFBertForMaskedLM, + TFBertForNextSentencePrediction, + TFBertForPreTraining, + TFBertForSequenceClassification, + TFBertForMultipleChoice, + TFBertForTokenClassification, + TFBertForQuestionAnswering, + TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) @require_tf class TFBertModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFBertModel, TFBertForMaskedLM, TFBertForNextSentencePrediction, - TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, - TFBertForTokenClassification) if is_tf_available() else () + all_model_classes = ( + ( + TFBertModel, + TFBertForMaskedLM, + TFBertForNextSentencePrediction, + TFBertForPreTraining, + TFBertForQuestionAnswering, + TFBertForSequenceClassification, + TFBertForTokenClassification, + ) + if is_tf_available() + else () + ) 
class TFBertModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -123,15 +136,16 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester): attention_probs_dropout_prob=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFBertModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} sequence_output, pooled_output = model(inputs) inputs = [input_ids, input_mask] @@ -144,128 +158,119 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester): "pooled_output": pooled_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) - - def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFBertForMaskedLM(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - prediction_scores, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (prediction_scores,) = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) - + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - def 
create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_next_sequence_prediction( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFBertForNextSentencePrediction(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - seq_relationship_score, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (seq_relationship_score,) = model(inputs) result = { "seq_relationship_score": seq_relationship_score.numpy(), } - self.parent.assertListEqual( - list(result["seq_relationship_score"].shape), - [self.batch_size, 2]) - + self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2]) - def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFBertForPreTraining(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} prediction_scores, seq_relationship_score = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), "seq_relationship_score": seq_relationship_score.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) - self.parent.assertListEqual( - list(result["seq_relationship_score"].shape), - [self.batch_size, 2]) - + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) + self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2]) - def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = TFBertForSequenceClassification(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - logits, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (logits,) = model(inputs) result = { "logits": logits.numpy(), } - self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.num_labels]) - + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) - def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_choices = self.num_choices model = TFBertForMultipleChoice(config=config) multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, 
self.num_choices, 1)) multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) - inputs = {'input_ids': multiple_choice_inputs_ids, - 'attention_mask': multiple_choice_input_mask, - 'token_type_ids': multiple_choice_token_type_ids} - logits, = model(inputs) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + (logits,) = model(inputs) result = { "logits": logits.numpy(), } - self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.num_choices]) - + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) - def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = TFBertForTokenClassification(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - logits, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (logits,) = model(inputs) result = { "logits": logits.numpy(), } self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.seq_length, self.num_labels]) + list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels] + ) - - def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFBertForQuestionAnswering(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} start_logits, end_logits = model(inputs) result = { "start_logits": start_logits.numpy(), "end_logits": end_logits.numpy(), } - self.parent.assertListEqual( - list(result["start_logits"].shape), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].shape), - [self.batch_size, self.seq_length]) - + self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -310,10 +315,10 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - for 
model_name in ['bert-base-uncased']: + for model_name in ["bert-base-uncased"]: model = TFBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() - diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py index 5a5873e81..d65e270ae 100644 --- a/transformers/tests/modeling_tf_common_test.py +++ b/transformers/tests/modeling_tf_common_test.py @@ -35,6 +35,7 @@ if is_tf_available(): import tensorflow as tf import numpy as np from transformers import TFPreTrainedModel + # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP if sys.version_info[0] == 2: @@ -42,25 +43,31 @@ if sys.version_info[0] == 2: class TemporaryDirectory(object): """Context manager for tempfile.mkdtemp() so it's usable with "with" statement.""" + def __enter__(self): self.name = tempfile.mkdtemp() return self.name + def __exit__(self, exc_type, exc_value, traceback): shutil.rmtree(self.name) + + else: import pickle + TemporaryDirectory = tempfile.TemporaryDirectory unicode = str + def _config_zero_init(config): configs_no_init = copy.deepcopy(config) for key in configs_no_init.__dict__.keys(): - if '_range' in key or '_std' in key: + if "_range" in key or "_std" in key: setattr(configs_no_init, key, 0.0) return configs_no_init -class TFCommonTestCases: +class TFCommonTestCases: @require_tf class TFCommonModelTester(unittest.TestCase): @@ -126,8 +133,9 @@ class TFCommonTestCases: # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences pt_model.eval() - pt_inputs_dict = dict((name, torch.from_numpy(key.numpy()).to(torch.long)) - for name, key in inputs_dict.items()) + pt_inputs_dict = dict( + (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items() + ) with torch.no_grad(): pto = pt_model(**pt_inputs_dict) tfo = tf_model(inputs_dict, training=False) @@ -140,18 +148,19 @@ class TFCommonTestCases: # Check we can load pt model in tf and vice-versa with checkpoint => model functions with TemporaryDirectory() as tmpdirname: - pt_checkpoint_path = os.path.join(tmpdirname, 'pt_model.bin') + pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") torch.save(pt_model.state_dict(), pt_checkpoint_path) tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) - tf_checkpoint_path = os.path.join(tmpdirname, 'tf_model.h5') + tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") tf_model.save_weights(tf_checkpoint_path) pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences pt_model.eval() - pt_inputs_dict = dict((name, torch.from_numpy(key.numpy()).to(torch.long)) - for name, key in inputs_dict.items()) + pt_inputs_dict = dict( + (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items() + ) with torch.no_grad(): pto = pt_model(**pt_inputs_dict) tfo = tf_model(inputs_dict) @@ -166,13 +175,19 @@ class TFCommonTestCases: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if self.is_encoder_decoder: - input_ids = {'decoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='decoder_input_ids', dtype='int32'), - 'encoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='encoder_input_ids', dtype='int32')} + input_ids 
= { + "decoder_input_ids": tf.keras.Input( + batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32" + ), + "encoder_input_ids": tf.keras.Input( + batch_shape=(2, 2000), name="encoder_input_ids", dtype="int32" + ), + } else: - input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32') + input_ids = tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32") optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") for model_class in self.all_model_classes: # Prepare our model @@ -188,7 +203,7 @@ class TFCommonTestCases: hidden_states = outputs_dict[0] # Add a dense layer on top to test intetgration with other keras modules - outputs = tf.keras.layers.Dense(2, activation='softmax', name='outputs')(hidden_states) + outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) # Compile extended model extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs]) @@ -202,7 +217,9 @@ class TFCommonTestCases: outputs_dict = model(inputs_dict) inputs_keywords = copy.deepcopy(inputs_dict) - input_ids = inputs_keywords.pop('input_ids' if not self.is_encoder_decoder else 'decoder_input_ids', None) + input_ids = inputs_keywords.pop( + "input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None + ) outputs_keywords = model(input_ids, **inputs_keywords) output_dict = outputs_dict[0].numpy() @@ -213,10 +230,22 @@ class TFCommonTestCases: def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length - encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length - decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length - encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length + decoder_seq_length = ( + self.model_tester.decoder_seq_length + if hasattr(self.model_tester, "decoder_seq_length") + else self.model_tester.seq_length + ) + encoder_seq_length = ( + self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "encoder_seq_length") + else self.model_tester.seq_length + ) + decoder_key_length = ( + self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length + ) + encoder_key_length = ( + self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length + ) for model_class in self.all_model_classes: config.output_attentions = True @@ -229,22 +258,20 @@ class TFCommonTestCases: self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - encoder_seq_length, - encoder_key_length]) + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) out_len = len(outputs) if self.is_encoder_decoder: self.assertEqual(out_len % 2, 0) - decoder_attentions = outputs[(out_len // 2)-1] + decoder_attentions = outputs[(out_len // 2) - 1] self.assertEqual(model.config.output_attentions, True) 
self.assertEqual(model.config.output_hidden_states, False) self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - decoder_seq_length, - decoder_key_length]) + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) # Check attention is always last and order is fine config.output_attentions = True @@ -259,9 +286,8 @@ class TFCommonTestCases: self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - encoder_seq_length, - encoder_key_length]) + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) def test_hidden_states_output(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -276,8 +302,8 @@ class TFCommonTestCases: self.assertEqual(model.config.output_hidden_states, True) self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [self.model_tester.seq_length, self.model_tester.hidden_size]) + list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size] + ) def test_model_common_attributes(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -357,9 +383,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): for _ in range(total_dims): values.append(rng.randint(0, vocab_size - 1)) - output = tf.constant(values, - shape=shape, - dtype=dtype if dtype is not None else tf.int32) + output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32) return output diff --git a/transformers/tests/modeling_tf_ctrl_test.py b/transformers/tests/modeling_tf_ctrl_test.py index 93b231e51..fb8c4c255 100644 --- a/transformers/tests/modeling_tf_ctrl_test.py +++ b/transformers/tests/modeling_tf_ctrl_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import sys -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,8 +27,7 @@ from transformers import CTRLConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_ctrl import (TFCTRLModel, TFCTRLLMHeadModel, - TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP @require_tf @@ -37,32 +36,32 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester): all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else () class TFCTRLModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + 
use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -127,13 +126,21 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester): head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFCTRLModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} sequence_output = model(inputs)[0] inputs = [input_ids, None, input_mask] # None is the input for 'past' @@ -145,30 +152,36 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester): "sequence_output": sequence_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) - + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFCTRLLMHeadModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} prediction_scores = model(inputs)[0] result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, head_mask, token_type_ids, - mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs - - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -192,6 +205,6 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester): model = TFCTRLModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() - diff --git a/transformers/tests/modeling_tf_distilbert_test.py b/transformers/tests/modeling_tf_distilbert_test.py index f28b5c397..3260f63d5 100644 --- a/transformers/tests/modeling_tf_distilbert_test.py +++ 
b/transformers/tests/modeling_tf_distilbert_test.py @@ -18,7 +18,7 @@ from __future__ import print_function import unittest -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -26,48 +26,58 @@ from transformers import DistilBertConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_distilbert import (TFDistilBertModel, - TFDistilBertForMaskedLM, - TFDistilBertForQuestionAnswering, - TFDistilBertForSequenceClassification) + from transformers.modeling_tf_distilbert import ( + TFDistilBertModel, + TFDistilBertForMaskedLM, + TFDistilBertForQuestionAnswering, + TFDistilBertForSequenceClassification, + ) @require_tf class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, - TFDistilBertForSequenceClassification) if is_tf_available() else None + all_model_classes = ( + ( + TFDistilBertModel, + TFDistilBertForMaskedLM, + TFDistilBertForQuestionAnswering, + TFDistilBertForSequenceClassification, + ) + if is_tf_available() + else None + ) test_pruning = True test_torchscript = True test_resize_embeddings = True test_head_masking = True class TFDistilBertModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -116,14 +126,16 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester): dropout=self.hidden_dropout_prob, attention_dropout=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_model( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFDistilBertModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask} + inputs = {"input_ids": input_ids, "attention_mask": input_mask} outputs = model(inputs) sequence_output = outputs[0] @@ -136,54 +148,51 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester): "sequence_output": sequence_output.numpy(), } 
self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) - def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_masked_lm( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFDistilBertForMaskedLM(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask} + inputs = {"input_ids": input_ids, "attention_mask": input_mask} (prediction_scores,) = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_question_answering( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFDistilBertForQuestionAnswering(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask} + inputs = {"input_ids": input_ids, "attention_mask": input_mask} start_logits, end_logits = model(inputs) result = { "start_logits": start_logits.numpy(), "end_logits": end_logits.numpy(), } - self.parent.assertListEqual( - list(result["start_logits"].shape), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].shape), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) - def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_sequence_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = TFDistilBertForSequenceClassification(config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask} + inputs = {"input_ids": input_ids, "attention_mask": input_mask} (logits,) = model(inputs) result = { "logits": logits.numpy(), } - self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'attention_mask': input_mask} + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -215,5 +224,6 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester): # model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) # self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/modeling_tf_gpt2_test.py b/transformers/tests/modeling_tf_gpt2_test.py index 90920342b..09b7eb071 
100644 --- a/transformers/tests/modeling_tf_gpt2_test.py +++ b/transformers/tests/modeling_tf_gpt2_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import sys -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,45 +27,47 @@ from transformers import GPT2Config, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel, - TFGPT2DoubleHeadsModel, - TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_gpt2 import ( + TFGPT2Model, + TFGPT2LMHeadModel, + TFGPT2DoubleHeadsModel, + TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + ) @require_tf class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, - TFGPT2DoubleHeadsModel) if is_tf_available() else () + all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel) if is_tf_available() else () # all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel) if is_tf_available() else () class TFGPT2ModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -130,13 +132,21 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFGPT2Model(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} sequence_output = model(inputs)[0] inputs = [input_ids, None, input_mask] # None is the input for 'past' @@ -148,54 +158,58 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): "sequence_output": sequence_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) - + 
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFGPT2LMHeadModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} prediction_scores = model(inputs)[0] result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) - + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - def create_and_check_gpt2_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args): + def create_and_check_gpt2_double_head( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args + ): model = TFGPT2DoubleHeadsModel(config=config) multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) - inputs = {'input_ids': multiple_choice_inputs_ids, - 'mc_token_ids': mc_token_ids, - 'attention_mask': multiple_choice_input_mask, - 'token_type_ids': multiple_choice_token_type_ids} - lm_logits, mc_logits = model(inputs)[:2] - result = { - "lm_logits": lm_logits.numpy(), - "mc_logits": mc_logits.numpy() + inputs = { + "input_ids": multiple_choice_inputs_ids, + "mc_token_ids": mc_token_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, } + lm_logits, mc_logits = model(inputs)[:2] + result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()} self.parent.assertListEqual( - list(result["lm_logits"].shape), - [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]) - self.parent.assertListEqual( - list(result["mc_logits"].shape), - [self.batch_size, self.num_choices]) + list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size] + ) + self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, head_mask, token_type_ids, - mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs - - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -223,6 +237,6 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): model = TFGPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() - diff --git a/transformers/tests/modeling_tf_openai_gpt_test.py b/transformers/tests/modeling_tf_openai_gpt_test.py index 065bf2acd..a59395e02 100644 --- a/transformers/tests/modeling_tf_openai_gpt_test.py +++ 
b/transformers/tests/modeling_tf_openai_gpt_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import sys -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,44 +27,48 @@ from transformers import OpenAIGPTConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, - TFOpenAIGPTDoubleHeadsModel, - TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_openai import ( + TFOpenAIGPTModel, + TFOpenAIGPTLMHeadModel, + TFOpenAIGPTDoubleHeadsModel, + TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) @require_tf class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, - TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else () + all_model_classes = ( + (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else () + ) class TFOpenAIGPTModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -129,13 +133,21 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFOpenAIGPTModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} sequence_output = model(inputs)[0] inputs = [input_ids, input_mask] @@ -147,54 +159,58 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): "sequence_output": sequence_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) - + list(result["sequence_output"].shape), [self.batch_size, 
self.seq_length, self.hidden_size] + ) def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFOpenAIGPTLMHeadModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} prediction_scores = model(inputs)[0] result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) - + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - def create_and_check_openai_gpt_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args): + def create_and_check_openai_gpt_double_head( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args + ): model = TFOpenAIGPTDoubleHeadsModel(config=config) multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) - inputs = {'input_ids': multiple_choice_inputs_ids, - 'mc_token_ids': mc_token_ids, - 'attention_mask': multiple_choice_input_mask, - 'token_type_ids': multiple_choice_token_type_ids} - lm_logits, mc_logits = model(inputs)[:2] - result = { - "lm_logits": lm_logits.numpy(), - "mc_logits": mc_logits.numpy() + inputs = { + "input_ids": multiple_choice_inputs_ids, + "mc_token_ids": mc_token_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, } + lm_logits, mc_logits = model(inputs)[:2] + result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()} self.parent.assertListEqual( - list(result["lm_logits"].shape), - [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]) - self.parent.assertListEqual( - list(result["mc_logits"].shape), - [self.batch_size, self.num_choices]) + list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size] + ) + self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, head_mask, token_type_ids, - mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs - - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -222,6 +238,6 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() - diff --git a/transformers/tests/modeling_tf_roberta_test.py b/transformers/tests/modeling_tf_roberta_test.py index 93c478ae2..23ea55740 100644 --- a/transformers/tests/modeling_tf_roberta_test.py +++ 
b/transformers/tests/modeling_tf_roberta_test.py @@ -18,7 +18,7 @@ from __future__ import print_function import unittest -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,44 +27,48 @@ from transformers import RobertaConfig, is_tf_available if is_tf_available(): import tensorflow as tf import numpy - from transformers.modeling_tf_roberta import (TFRobertaModel, TFRobertaForMaskedLM, - TFRobertaForSequenceClassification, - TFRobertaForTokenClassification, - TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_roberta import ( + TFRobertaModel, + TFRobertaForMaskedLM, + TFRobertaForSequenceClassification, + TFRobertaForTokenClassification, + TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + ) @require_tf class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFRobertaModel,TFRobertaForMaskedLM, - TFRobertaForSequenceClassification) if is_tf_available() else () + all_model_classes = ( + (TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification) if is_tf_available() else () + ) class TFRobertaModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -118,16 +122,16 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): attention_probs_dropout_prob=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, - token_labels, choice_labels): + def create_and_check_roberta_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFRobertaModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} sequence_output = model(inputs)[0] inputs = [input_ids, input_mask] @@ -139,39 +143,47 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): "sequence_output": sequence_output.numpy(), } 
self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) - def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, - token_labels, choice_labels): + def create_and_check_roberta_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFRobertaForMaskedLM(config=config) prediction_scores = model([input_ids, input_mask, token_type_ids])[0] result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_roberta_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = TFRobertaForTokenClassification(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - logits, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (logits,) = model(inputs) result = { "logits": logits.numpy(), } self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.seq_length, self.num_labels]) + list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels] + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -196,61 +208,43 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): self.assertIsNotNone(model) - class TFRobertaModelIntegrationTest(unittest.TestCase): - @slow def test_inference_masked_lm(self): - model = TFRobertaForMaskedLM.from_pretrained('roberta-base') + model = TFRobertaForMaskedLM.from_pretrained("roberta-base") - input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) output = model(input_ids)[0] expected_shape = [1, 11, 50265] - self.assertEqual( - list(output.numpy().shape), - expected_shape - ) + self.assertEqual(list(output.numpy().shape), expected_shape) # compare the actual values for a slice. 
expected_slice = tf.constant( - [[[33.8843, -4.3107, 22.7779], - [ 4.6533, -2.8099, 13.6252], - [ 1.8222, -3.6898, 8.8600]]] - ) - self.assertTrue( - numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3) + [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]] ) + self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)) @slow def test_inference_no_head(self): - model = TFRobertaModel.from_pretrained('roberta-base') + model = TFRobertaModel.from_pretrained("roberta-base") - input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) output = model(input_ids)[0] # compare the actual values for a slice. expected_slice = tf.constant( - [[[-0.0231, 0.0782, 0.0074], - [-0.1854, 0.0539, -0.0174], - [ 0.0548, 0.0799, 0.1687]]] - ) - self.assertTrue( - numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3) + [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0539, -0.0174], [0.0548, 0.0799, 0.1687]]] ) + self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)) @slow def test_inference_classification_head(self): - model = TFRobertaForSequenceClassification.from_pretrained('roberta-large-mnli') + model = TFRobertaForSequenceClassification.from_pretrained("roberta-large-mnli") - input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) output = model(input_ids)[0] expected_shape = [1, 3] - self.assertEqual( - list(output.numpy().shape), - expected_shape - ) - expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]]) - self.assertTrue( - numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-3) - ) + self.assertEqual(list(output.numpy().shape), expected_shape) + expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]]) + self.assertTrue(numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-3)) if __name__ == "__main__": diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py index da9ce6f89..521085219 100644 --- a/transformers/tests/modeling_tf_t5_test.py +++ b/transformers/tests/modeling_tf_t5_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import sys -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,8 +27,7 @@ from transformers import T5Config, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel, - TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP @require_tf @@ -38,25 +37,25 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else () class TFT5ModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - n_positions=14, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=8, - dropout_rate=0.1, - initializer_factor=0.002, - 
scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + n_positions=14, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + d_ff=37, + relative_attention_num_buckets=8, + dropout_rate=0.1, + initializer_factor=0.002, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -95,53 +94,58 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): num_heads=self.num_attention_heads, relative_attention_num_buckets=self.relative_attention_num_buckets, dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor) + initializer_factor=self.initializer_factor, + ) return (config, input_ids, input_mask, token_labels) def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels): model = TFT5Model(config=config) - inputs = {'encoder_input_ids': input_ids, - 'decoder_input_ids': input_ids, - 'decoder_attention_mask': input_mask} + inputs = { + "encoder_input_ids": input_ids, + "decoder_input_ids": input_ids, + "decoder_attention_mask": input_mask, + } encoder_output, decoder_output = model(inputs) - encoder_output, decoder_output = model(input_ids, - decoder_attention_mask=input_mask, - encoder_input_ids=input_ids) + encoder_output, decoder_output = model( + input_ids, decoder_attention_mask=input_mask, encoder_input_ids=input_ids + ) result = { "encoder_output": encoder_output.numpy(), "decoder_output": decoder_output.numpy(), } self.parent.assertListEqual( - list(result["encoder_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["encoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual( - list(result["decoder_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) - + list(result["decoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels): model = TFT5WithLMHeadModel(config=config) - inputs = {'encoder_input_ids': input_ids, - 'decoder_input_ids': input_ids, - 'decoder_attention_mask': input_mask} + inputs = { + "encoder_input_ids": input_ids, + "decoder_input_ids": input_ids, + "decoder_attention_mask": input_mask, + } prediction_scores, decoder_output = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) - + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids, input_mask, token_labels) = config_and_inputs - inputs_dict = {'encoder_input_ids': input_ids, - 'decoder_input_ids': input_ids, - 'decoder_attention_mask': input_mask} + inputs_dict = { + "encoder_input_ids": input_ids, + "decoder_input_ids": input_ids, + "decoder_attention_mask": input_mask, + } return config, inputs_dict def setUp(self): @@ -161,9 +165,10 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - for model_name in ['t5-small']: + for model_name in ["t5-small"]: model = TFT5Model.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git 
a/transformers/tests/modeling_tf_transfo_xl_test.py b/transformers/tests/modeling_tf_transfo_xl_test.py index 8225c0927..20de598d0 100644 --- a/transformers/tests/modeling_tf_transfo_xl_test.py +++ b/transformers/tests/modeling_tf_transfo_xl_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import random -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,9 +27,11 @@ from transformers import TransfoXLConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_transfo_xl import (TFTransfoXLModel, - TFTransfoXLLMHeadModel, - TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_transfo_xl import ( + TFTransfoXLModel, + TFTransfoXLLMHeadModel, + TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + ) @require_tf @@ -41,27 +43,27 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): test_resize_embeddings = False class TFTransfoXLModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - mem_len=30, - clamp_len=15, - is_training=True, - use_labels=True, - vocab_size=99, - cutoffs=[10, 50, 80], - hidden_size=32, - d_embed=32, - num_attention_heads=4, - d_head=8, - d_inner=128, - div_val=2, - num_hidden_layers=5, - scope=None, - seed=1, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + mem_len=30, + clamp_len=15, + is_training=True, + use_labels=True, + vocab_size=99, + cutoffs=[10, 50, 80], + hidden_size=32, + d_embed=32, + num_attention_heads=4, + d_head=8, + d_inner=128, + div_val=2, + num_hidden_layers=5, + scope=None, + seed=1, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -101,7 +103,8 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): d_head=self.d_head, d_inner=self.d_inner, div_val=self.div_val, - n_layer=self.num_hidden_layers) + n_layer=self.num_hidden_layers, + ) return (config, input_ids_1, input_ids_2, lm_labels) @@ -114,8 +117,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): hidden_states_1, mems_1 = model(input_ids_1) - inputs = {'input_ids': input_ids_2, - 'mems': mems_1} + inputs = {"input_ids": input_ids_2, "mems": mems_1} hidden_states_2, mems_2 = model(inputs) @@ -127,33 +129,31 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): } self.parent.assertListEqual( - list(result["hidden_states_1"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["hidden_states_1"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual( - list(result["hidden_states_2"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["hidden_states_2"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_1"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_2"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) - + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, 
lm_labels): model = TFTransfoXLLMHeadModel(config) lm_logits_1, mems_1 = model(input_ids_1) - inputs = {'input_ids': input_ids_1, - 'labels': lm_labels} + inputs = {"input_ids": input_ids_1, "labels": lm_labels} _, mems_1 = model(inputs) lm_logits_2, mems_2 = model([input_ids_2, mems_1]) - inputs = {'input_ids': input_ids_1, - 'mems': mems_1, - 'labels': lm_labels} + inputs = {"input_ids": input_ids_1, "mems": mems_1, "labels": lm_labels} _, mems_2 = model(inputs) @@ -165,26 +165,27 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): } self.parent.assertListEqual( - list(result["lm_logits_1"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["lm_logits_1"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_1"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) self.parent.assertListEqual( - list(result["lm_logits_2"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["lm_logits_2"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_2"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids_1} + inputs_dict = {"input_ids": input_ids_1} return config, inputs_dict - def setUp(self): self.model_tester = TFTransfoXLModelTest.TFTransfoXLModelTester(self) self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37) diff --git a/transformers/tests/modeling_tf_xlm_test.py b/transformers/tests/modeling_tf_xlm_test.py index 065d355b4..9162bf2b3 100644 --- a/transformers/tests/modeling_tf_xlm_test.py +++ b/transformers/tests/modeling_tf_xlm_test.py @@ -22,13 +22,16 @@ from transformers import is_tf_available if is_tf_available(): import tensorflow as tf - from transformers import (XLMConfig, TFXLMModel, - TFXLMWithLMHeadModel, - TFXLMForSequenceClassification, - TFXLMForQuestionAnsweringSimple, - TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP) - -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) + from transformers import ( + XLMConfig, + TFXLMModel, + TFXLMWithLMHeadModel, + TFXLMForSequenceClassification, + TFXLMForQuestionAnsweringSimple, + TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -36,43 +39,44 @@ from .utils import CACHE_DIR, require_tf, slow @require_tf class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFXLMModel, TFXLMWithLMHeadModel, - TFXLMForSequenceClassification, - TFXLMForQuestionAnsweringSimple) if is_tf_available() else () - + all_model_classes = ( + (TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple) + if is_tf_available() + else () + ) class TFXLMModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_lengths=True, - use_token_type_ids=True, - use_labels=True, - 
gelu_activation=True, - sinusoidal_embeddings=False, - causal=False, - asm=False, - n_langs=2, - vocab_size=99, - n_special=0, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - summary_type="last", - use_proj=True, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_lengths=True, + use_token_type_ids=True, + use_labels=True, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=2, + vocab_size=99, + n_special=0, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + summary_type="last", + use_proj=True, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -109,7 +113,9 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): input_lengths = None if self.use_input_lengths: - input_lengths = ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 # small variation of seq_length + input_lengths = ( + ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 + ) # small variation of seq_length token_type_ids = None if self.use_token_type_ids: @@ -124,30 +130,48 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32) config = XLMConfig( - vocab_size=self.vocab_size, - n_special=self.n_special, - emb_dim=self.hidden_size, - n_layers=self.num_hidden_layers, - n_heads=self.num_attention_heads, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - gelu_activation=self.gelu_activation, - sinusoidal_embeddings=self.sinusoidal_embeddings, - asm=self.asm, - causal=self.causal, - n_langs=self.n_langs, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - summary_type=self.summary_type, - use_proj=self.use_proj) - - return config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask - - def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + vocab_size=self.vocab_size, + n_special=self.n_special, + emb_dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + gelu_activation=self.gelu_activation, + sinusoidal_embeddings=self.sinusoidal_embeddings, + asm=self.asm, + causal=self.causal, + n_langs=self.n_langs, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + summary_type=self.summary_type, + use_proj=self.use_proj, + ) + + return ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ) + + def create_and_check_xlm_model( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = TFXLMModel(config=config) - inputs = {'input_ids': 
input_ids, - 'lengths': input_lengths, - 'langs': token_type_ids} + inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} outputs = model(inputs) inputs = [input_ids, input_mask] @@ -157,16 +181,23 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): "sequence_output": sequence_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) - - - def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) + + def create_and_check_xlm_lm_head( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = TFXLMWithLMHeadModel(config) - inputs = {'input_ids': input_ids, - 'lengths': input_lengths, - 'langs': token_type_ids} + inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} outputs = model(inputs) logits = outputs[0] @@ -176,15 +207,23 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): } self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) - - - def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) + + def create_and_check_xlm_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = TFXLMForQuestionAnsweringSimple(config) - inputs = {'input_ids': input_ids, - 'lengths': input_lengths} + inputs = {"input_ids": input_ids, "lengths": input_lengths} outputs = model(inputs) start_logits, end_logits = model(inputs) @@ -194,19 +233,23 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): "end_logits": end_logits.numpy(), } - self.parent.assertListEqual( - list(result["start_logits"].shape), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].shape), - [self.batch_size, self.seq_length]) - - - def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) + + def create_and_check_xlm_sequence_classif( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = TFXLMForSequenceClassification(config) - inputs = {'input_ids': input_ids, - 'lengths': input_lengths} + inputs = {"input_ids": input_ids, "lengths": input_lengths} (logits,) = model(inputs) @@ -214,16 +257,26 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): "logits": logits.numpy(), } - self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.type_sequence_label_size]) - + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size]) def prepare_config_and_inputs_for_common(self): config_and_inputs = 
self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_lengths, - sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'langs': token_type_ids, 'lengths': input_lengths} + ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "langs": token_type_ids, + "lengths": input_lengths, + } return config, inputs_dict def setUp(self): diff --git a/transformers/tests/modeling_tf_xlnet_test.py b/transformers/tests/modeling_tf_xlnet_test.py index 15fd91748..9a56384a0 100644 --- a/transformers/tests/modeling_tf_xlnet_test.py +++ b/transformers/tests/modeling_tf_xlnet_test.py @@ -26,13 +26,16 @@ from transformers import XLNetConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_xlnet import (TFXLNetModel, TFXLNetLMHeadModel, - TFXLNetForSequenceClassification, - TFXLNetForTokenClassification, - TFXLNetForQuestionAnsweringSimple, - TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) - -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) + from transformers.modeling_tf_xlnet import ( + TFXLNetModel, + TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForTokenClassification, + TFXLNetForQuestionAnsweringSimple, + TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -40,37 +43,44 @@ from .utils import CACHE_DIR, require_tf, slow @require_tf class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes=(TFXLNetModel, TFXLNetLMHeadModel, - TFXLNetForSequenceClassification, - TFXLNetForTokenClassification, - TFXLNetForQuestionAnsweringSimple) if is_tf_available() else () + all_model_classes = ( + ( + TFXLNetModel, + TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForTokenClassification, + TFXLNetForQuestionAnsweringSimple, + ) + if is_tf_available() + else () + ) test_pruning = False class TFXLNetModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - mem_len=10, - clamp_len=-1, - reuse_len=15, - is_training=True, - use_labels=True, - vocab_size=99, - cutoffs=[10, 50, 80], - hidden_size=32, - num_attention_heads=4, - d_inner=128, - num_hidden_layers=5, - type_sequence_label_size=2, - untie_r=True, - bi_data=False, - same_length=False, - initializer_range=0.05, - seed=1, - type_vocab_size=2, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + mem_len=10, + clamp_len=-1, + reuse_len=15, + is_training=True, + use_labels=True, + vocab_size=99, + cutoffs=[10, 50, 80], + hidden_size=32, + num_attention_heads=4, + d_inner=128, + num_hidden_layers=5, + type_sequence_label_size=2, + untie_r=True, + bi_data=False, + same_length=False, + initializer_range=0.05, + seed=1, + type_vocab_size=2, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -131,22 +141,44 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): reuse_len=self.reuse_len, bi_data=self.bi_data, initializer_range=self.initializer_range, - num_labels=self.type_sequence_label_size) - - return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, 
segment_ids, lm_labels, sequence_labels, is_impossible_labels) + num_labels=self.type_sequence_label_size, + ) + + return ( + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ) def set_seed(self): random.seed(self.seed) tf.random.set_seed(self.seed) - def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + def create_and_check_xlnet_base_model( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): model = TFXLNetModel(config) - inputs = {'input_ids': input_ids_1, - 'input_mask': input_mask, - 'token_type_ids': segment_ids} + inputs = {"input_ids": input_ids_1, "input_mask": input_mask, "token_type_ids": segment_ids} _, _ = model(inputs) @@ -165,30 +197,38 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): self.parent.assertEqual(len(no_mems_outputs), 1) self.parent.assertListEqual( - list(result["outputs"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["outputs"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) - - def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_lm_head( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): model = TFXLNetLMHeadModel(config) - inputs_1 = {'input_ids': input_ids_1, - 'token_type_ids': segment_ids} + inputs_1 = {"input_ids": input_ids_1, "token_type_ids": segment_ids} all_logits_1, mems_1 = model(inputs_1) - inputs_2 = {'input_ids': input_ids_2, - 'mems': mems_1, - 'token_type_ids': segment_ids} + inputs_2 = {"input_ids": input_ids_2, "mems": mems_1, "token_type_ids": segment_ids} all_logits_2, mems_2 = model(inputs_2) - inputs_3 = {'input_ids': input_ids_q, - 'perm_mask': perm_mask, - 'target_mapping': target_mapping} + inputs_3 = {"input_ids": input_ids_q, "perm_mask": perm_mask, "target_mapping": target_mapping} logits, _ = model(inputs_3) @@ -200,26 +240,38 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): } self.parent.assertListEqual( - list(result["all_logits_1"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["all_logits_1"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) self.parent.assertListEqual( - list(result["all_logits_2"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["all_logits_2"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.shape) for mem in 
result["mems_2"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) - - def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_qa( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): model = TFXLNetForQuestionAnsweringSimple(config) - inputs = {'input_ids': input_ids_1, - 'attention_mask': input_mask, - 'token_type_ids': segment_ids} + inputs = {"input_ids": input_ids_1, "attention_mask": input_mask, "token_type_ids": segment_ids} start_logits, end_logits, mems = model(inputs) result = { @@ -228,18 +280,27 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): "mems": [m.numpy() for m in mems], } - self.parent.assertListEqual( - list(result["start_logits"].shape), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].shape), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) - - def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_sequence_classif( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): model = TFXLNetForSequenceClassification(config) logits, mems_1 = model(input_ids_1) @@ -249,42 +310,64 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): "logits": logits.numpy(), } - self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.type_sequence_label_size]) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size]) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) - - def create_and_check_xlnet_for_token_classification(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_for_token_classification( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): config.num_labels = input_ids_1.shape[1] model = TFXLNetForTokenClassification(config) - inputs = {'input_ids': input_ids_1, - 'attention_mask': input_mask, - # 'token_type_ids': token_type_ids - } + inputs = { + "input_ids": input_ids_1, + "attention_mask": input_mask, + # 'token_type_ids': 
token_type_ids + } logits, mems_1 = model(inputs) result = { "mems_1": [mem.numpy() for mem in mems_1], "logits": logits.numpy(), } self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.seq_length, config.num_labels]) + list(result["logits"].shape), [self.batch_size, self.seq_length, config.num_labels] + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, - sequence_labels, is_impossible_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids_1} + ( + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids_1} return config, inputs_dict - def setUp(self): self.model_tester = TFXLNetModelTest.TFXLNetModelTester(self) self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37) diff --git a/transformers/tests/modeling_transfo_xl_test.py b/transformers/tests/modeling_transfo_xl_test.py index acbe95fe4..f04205d4e 100644 --- a/transformers/tests/modeling_transfo_xl_test.py +++ b/transformers/tests/modeling_transfo_xl_test.py @@ -23,10 +23,10 @@ from transformers import is_torch_available if is_torch_available(): import torch - from transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) + from transformers import TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -40,27 +40,27 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester): test_resize_embeddings = False class TransfoXLModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - mem_len=30, - clamp_len=15, - is_training=True, - use_labels=True, - vocab_size=99, - cutoffs=[10, 50, 80], - hidden_size=32, - d_embed=32, - num_attention_heads=4, - d_head=8, - d_inner=128, - div_val=2, - num_hidden_layers=5, - scope=None, - seed=1, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + mem_len=30, + clamp_len=15, + is_training=True, + use_labels=True, + vocab_size=99, + cutoffs=[10, 50, 80], + hidden_size=32, + d_embed=32, + num_attention_heads=4, + d_head=8, + d_inner=128, + div_val=2, + num_hidden_layers=5, + scope=None, + seed=1, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -100,7 +100,8 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester): d_head=self.d_head, d_inner=self.d_inner, div_val=self.div_val, - n_layer=self.num_hidden_layers) + n_layer=self.num_hidden_layers, + ) return (config, input_ids_1, input_ids_2, lm_labels) @@ -125,18 +126,19 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester): def check_transfo_xl_model_output(self, result): self.parent.assertListEqual( - list(result["hidden_states_1"].size()), - [self.batch_size, 
self.seq_length, self.hidden_size]) + list(result["hidden_states_1"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual( - list(result["hidden_states_2"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["hidden_states_2"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_1"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_2"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) - + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels): model = TransfoXLLMHeadModel(config) @@ -159,33 +161,30 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester): return outputs def check_transfo_xl_lm_head_output(self, result): + self.parent.assertListEqual(list(result["loss_1"].size()), [self.batch_size, self.seq_length]) self.parent.assertListEqual( - list(result["loss_1"].size()), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["lm_logits_1"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["lm_logits_1"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_1"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) + self.parent.assertListEqual(list(result["loss_2"].size()), [self.batch_size, self.seq_length]) self.parent.assertListEqual( - list(result["loss_2"].size()), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["lm_logits_2"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["lm_logits_2"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_2"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids_1} + inputs_dict = {"input_ids": input_ids_1} return config, inputs_dict - def setUp(self): self.model_tester = TransfoXLModelTest.TransfoXLModelTester(self) self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37) diff --git a/transformers/tests/modeling_xlm_test.py b/transformers/tests/modeling_xlm_test.py index fcc2f4699..843693fd0 100644 --- a/transformers/tests/modeling_xlm_test.py +++ b/transformers/tests/modeling_xlm_test.py @@ -21,11 +21,17 @@ import unittest from transformers import is_torch_available if is_torch_available(): - from transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, - XLMForSequenceClassification, XLMForQuestionAnsweringSimple) + from transformers import ( + XLMConfig, + XLMModel, + XLMWithLMHeadModel, + XLMForQuestionAnswering, + XLMForSequenceClassification, + XLMForQuestionAnsweringSimple, + ) from 
transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -33,42 +39,50 @@ from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch class XLMModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, - XLMForSequenceClassification, XLMForQuestionAnsweringSimple) if is_torch_available() else () - + all_model_classes = ( + ( + XLMModel, + XLMWithLMHeadModel, + XLMForQuestionAnswering, + XLMForSequenceClassification, + XLMForQuestionAnsweringSimple, + ) + if is_torch_available() + else () + ) class XLMModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_lengths=True, - use_token_type_ids=True, - use_labels=True, - gelu_activation=True, - sinusoidal_embeddings=False, - causal=False, - asm=False, - n_langs=2, - vocab_size=99, - n_special=0, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - summary_type="last", - use_proj=True, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_lengths=True, + use_token_type_ids=True, + use_labels=True, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=2, + vocab_size=99, + n_special=0, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + summary_type="last", + use_proj=True, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -105,7 +119,9 @@ class XLMModelTest(CommonTestCases.CommonModelTester): input_lengths = None if self.use_input_lengths: - input_lengths = ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 # small variation of seq_length + input_lengths = ( + ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 + ) # small variation of seq_length token_type_ids = None if self.use_token_type_ids: @@ -120,31 +136,49 @@ class XLMModelTest(CommonTestCases.CommonModelTester): is_impossible_labels = ids_tensor([self.batch_size], 2).float() config = XLMConfig( - vocab_size=self.vocab_size, - n_special=self.n_special, - emb_dim=self.hidden_size, - n_layers=self.num_hidden_layers, - n_heads=self.num_attention_heads, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - gelu_activation=self.gelu_activation, - sinusoidal_embeddings=self.sinusoidal_embeddings, - asm=self.asm, - causal=self.causal, - n_langs=self.n_langs, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - summary_type=self.summary_type, - use_proj=self.use_proj) - - return config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask + vocab_size=self.vocab_size, + n_special=self.n_special, + emb_dim=self.hidden_size, + 
n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + gelu_activation=self.gelu_activation, + sinusoidal_embeddings=self.sinusoidal_embeddings, + asm=self.asm, + causal=self.causal, + n_langs=self.n_langs, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + summary_type=self.summary_type, + use_proj=self.use_proj, + ) + + return ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ) def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) - - def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + self.parent.assertListEqual(list(result["loss"].size()), []) + + def create_and_check_xlm_model( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = XLMModel(config=config) model.to(torch_device) model.eval() @@ -156,11 +190,20 @@ class XLMModelTest(CommonTestCases.CommonModelTester): "sequence_output": sequence_output, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) - - - def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) + + def create_and_check_xlm_lm_head( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = XLMWithLMHeadModel(config) model.to(torch_device) model.eval() @@ -172,23 +215,29 @@ class XLMModelTest(CommonTestCases.CommonModelTester): "logits": logits, } + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) - - - def create_and_check_xlm_simple_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) + + def create_and_check_xlm_simple_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = XLMForQuestionAnsweringSimple(config) model.to(torch_device) model.eval() outputs = model(input_ids) - outputs = model(input_ids, start_positions=sequence_labels, - end_positions=sequence_labels) + outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) loss, start_logits, end_logits = outputs result = { @@ -196,16 +245,21 @@ class XLMModelTest(CommonTestCases.CommonModelTester): "start_logits": start_logits, "end_logits": end_logits, } - self.parent.assertListEqual( - list(result["start_logits"].size()), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].size()), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length]) + 
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length]) self.check_loss_output(result) - - def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + def create_and_check_xlm_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = XLMForQuestionAnswering(config) model.to(torch_device) model.eval() @@ -213,21 +267,26 @@ class XLMModelTest(CommonTestCases.CommonModelTester): outputs = model(input_ids) start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = outputs - outputs = model(input_ids, start_positions=sequence_labels, - end_positions=sequence_labels, - cls_index=sequence_labels, - is_impossible=is_impossible_labels, - p_mask=input_mask) - - outputs = model(input_ids, start_positions=sequence_labels, - end_positions=sequence_labels, - cls_index=sequence_labels, - is_impossible=is_impossible_labels) + outputs = model( + input_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + p_mask=input_mask, + ) + + outputs = model( + input_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + ) (total_loss,) = outputs - outputs = model(input_ids, start_positions=sequence_labels, - end_positions=sequence_labels) + outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) (total_loss,) = outputs @@ -240,27 +299,34 @@ class XLMModelTest(CommonTestCases.CommonModelTester): "cls_logits": cls_logits, } + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) + list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top] + ) self.parent.assertListEqual( - list(result["start_top_log_probs"].size()), - [self.batch_size, model.config.start_n_top]) - self.parent.assertListEqual( - list(result["start_top_index"].size()), - [self.batch_size, model.config.start_n_top]) + list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top] + ) self.parent.assertListEqual( list(result["end_top_log_probs"].size()), - [self.batch_size, model.config.start_n_top * model.config.end_n_top]) + [self.batch_size, model.config.start_n_top * model.config.end_n_top], + ) self.parent.assertListEqual( list(result["end_top_index"].size()), - [self.batch_size, model.config.start_n_top * model.config.end_n_top]) - self.parent.assertListEqual( - list(result["cls_logits"].size()), - [self.batch_size]) - - - def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + [self.batch_size, model.config.start_n_top * model.config.end_n_top], + ) + self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size]) + + def create_and_check_xlm_sequence_classif( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = XLMForSequenceClassification(config) model.to(torch_device) model.eval() @@ -273,19 +339,24 @@ class XLMModelTest(CommonTestCases.CommonModelTester): "logits": logits, } + self.parent.assertListEqual(list(result["loss"].size()), []) 
self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.type_sequence_label_size]) - + list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size] + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_lengths, - sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths} + ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths} return config, inputs_dict def setUp(self): diff --git a/transformers/tests/modeling_xlnet_test.py b/transformers/tests/modeling_xlnet_test.py index 6d218d6ef..487756a5c 100644 --- a/transformers/tests/modeling_xlnet_test.py +++ b/transformers/tests/modeling_xlnet_test.py @@ -26,11 +26,17 @@ from transformers import is_torch_available if is_torch_available(): import torch - from transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, - XLNetForTokenClassification, XLNetForQuestionAnswering) + from transformers import ( + XLNetConfig, + XLNetModel, + XLNetLMHeadModel, + XLNetForSequenceClassification, + XLNetForTokenClassification, + XLNetForQuestionAnswering, + ) from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -38,35 +44,44 @@ from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch class XLNetModelTest(CommonTestCases.CommonModelTester): - all_model_classes=(XLNetModel, XLNetLMHeadModel, XLNetForTokenClassification, - XLNetForSequenceClassification, XLNetForQuestionAnswering) if is_torch_available() else () + all_model_classes = ( + ( + XLNetModel, + XLNetLMHeadModel, + XLNetForTokenClassification, + XLNetForSequenceClassification, + XLNetForQuestionAnswering, + ) + if is_torch_available() + else () + ) test_pruning = False class XLNetModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - mem_len=10, - clamp_len=-1, - reuse_len=15, - is_training=True, - use_labels=True, - vocab_size=99, - cutoffs=[10, 50, 80], - hidden_size=32, - num_attention_heads=4, - d_inner=128, - num_hidden_layers=5, - type_sequence_label_size=2, - untie_r=True, - bi_data=False, - same_length=False, - initializer_range=0.05, - seed=1, - type_vocab_size=2, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + mem_len=10, + clamp_len=-1, + reuse_len=15, + is_training=True, + use_labels=True, + vocab_size=99, + cutoffs=[10, 50, 80], + hidden_size=32, + num_attention_heads=4, + d_inner=128, + num_hidden_layers=5, + type_sequence_label_size=2, + untie_r=True, + bi_data=False, + same_length=False, + initializer_range=0.05, + seed=1, + type_vocab_size=2, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -97,9 +112,13 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float() 
input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size) - perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device) + perm_mask = torch.zeros( + self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device + ) perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token - target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device) + target_mapping = torch.zeros( + self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device + ) target_mapping[:, 0, -1] = 1.0 # predict last token sequence_labels = None @@ -125,17 +144,43 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): reuse_len=self.reuse_len, bi_data=self.bi_data, initializer_range=self.initializer_range, - num_labels=self.type_sequence_label_size) - - return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels) + num_labels=self.type_sequence_label_size, + ) + + return ( + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ) def set_seed(self): random.seed(self.seed) torch.manual_seed(self.seed) - def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels): + def create_and_check_xlnet_base_model( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): model = XLNetModel(config) model.to(torch_device) model.eval() @@ -158,14 +203,28 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): self.parent.assertEqual(len(no_mems_outputs), 1) self.parent.assertListEqual( - list(result["outputs"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["outputs"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) - - def create_and_check_xlnet_base_model_with_att_output(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels): + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_base_model_with_att_output( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): model = XLNetModel(config) model.to(torch_device) model.eval() @@ -177,15 +236,30 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): self.parent.assertEqual(len(attentions[0]), 2) self.parent.assertTrue(attentions[0][0].shape, attentions[0][0].shape) - def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels): + def create_and_check_xlnet_lm_head( + self, + config, + 
input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): model = XLNetLMHeadModel(config) model.to(torch_device) model.eval() loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels) - loss_2, all_logits_2, mems_2 = model(input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1) + loss_2, all_logits_2, mems_2 = model( + input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1 + ) logits, _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping) @@ -198,28 +272,39 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): "all_logits_2": all_logits_2, } + self.parent.assertListEqual(list(result["loss_1"].size()), []) self.parent.assertListEqual( - list(result["loss_1"].size()), - []) - self.parent.assertListEqual( - list(result["all_logits_1"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["all_logits_1"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) + self.parent.assertListEqual(list(result["loss_2"].size()), []) self.parent.assertListEqual( - list(result["loss_2"].size()), - []) - self.parent.assertListEqual( - list(result["all_logits_2"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["all_logits_2"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_2"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) - - def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels): + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_qa( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): model = XLNetForQuestionAnswering(config) model.to(torch_device) model.eval() @@ -227,21 +312,26 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): outputs = model(input_ids_1) start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems = outputs - outputs = model(input_ids_1, start_positions=sequence_labels, - end_positions=sequence_labels, - cls_index=sequence_labels, - is_impossible=is_impossible_labels, - p_mask=input_mask) - - outputs = model(input_ids_1, start_positions=sequence_labels, - end_positions=sequence_labels, - cls_index=sequence_labels, - is_impossible=is_impossible_labels) + outputs = model( + input_ids_1, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + p_mask=input_mask, + ) + + outputs = model( + input_ids_1, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + ) total_loss, mems = outputs - outputs = model(input_ids_1, start_positions=sequence_labels, - end_positions=sequence_labels) + outputs = 
model(input_ids_1, start_positions=sequence_labels, end_positions=sequence_labels) total_loss, mems = outputs @@ -255,30 +345,42 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): "mems": mems, } + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) + list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top] + ) self.parent.assertListEqual( - list(result["start_top_log_probs"].size()), - [self.batch_size, model.config.start_n_top]) - self.parent.assertListEqual( - list(result["start_top_index"].size()), - [self.batch_size, model.config.start_n_top]) + list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top] + ) self.parent.assertListEqual( list(result["end_top_log_probs"].size()), - [self.batch_size, model.config.start_n_top * model.config.end_n_top]) + [self.batch_size, model.config.start_n_top * model.config.end_n_top], + ) self.parent.assertListEqual( list(result["end_top_index"].size()), - [self.batch_size, model.config.start_n_top * model.config.end_n_top]) - self.parent.assertListEqual( - list(result["cls_logits"].size()), - [self.batch_size]) + [self.batch_size, model.config.start_n_top * model.config.end_n_top], + ) + self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size]) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) - - def create_and_check_xlnet_token_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels): + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_token_classif( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): model = XLNetForTokenClassification(config) model.to(torch_device) model.eval() @@ -292,26 +394,48 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): "logits": logits, } + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.seq_length, self.type_sequence_label_size]) + list(result["logits"].size()), [self.batch_size, self.seq_length, self.type_sequence_label_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, - sequence_labels, is_impossible_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids_1} + ( + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids_1} return config, inputs_dict - def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, 
input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels): + def create_and_check_xlnet_sequence_classif( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): model = XLNetForSequenceClassification(config) model.to(torch_device) model.eval() @@ -325,25 +449,34 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): "logits": logits, } + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.type_sequence_label_size]) + list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, - sequence_labels, is_impossible_labels, token_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids_1} + ( + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids_1} return config, inputs_dict - def setUp(self): self.model_tester = XLNetModelTest.XLNetModelTester(self) self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37) diff --git a/transformers/tests/optimization_test.py b/transformers/tests/optimization_test.py index cc10ad590..0addcde1d 100644 --- a/transformers/tests/optimization_test.py +++ b/transformers/tests/optimization_test.py @@ -24,12 +24,14 @@ from transformers import is_torch_available if is_torch_available(): import torch - from transformers import (AdamW, - get_constant_schedule, - get_constant_schedule_with_warmup, - get_cosine_schedule_with_warmup, - get_cosine_with_hard_restarts_schedule_with_warmup, - get_linear_schedule_with_warmup) + from transformers import ( + AdamW, + get_constant_schedule, + get_constant_schedule_with_warmup, + get_cosine_schedule_with_warmup, + get_cosine_with_hard_restarts_schedule_with_warmup, + get_linear_schedule_with_warmup, + ) from .tokenization_tests_commons import TemporaryDirectory from .utils import require_torch @@ -42,6 +44,7 @@ def unwrap_schedule(scheduler, num_steps=10): lrs.append(scheduler.get_lr()) return lrs + def unwrap_and_save_reload_schedule(scheduler, num_steps=10): lrs = [] for step in range(num_steps): @@ -49,16 +52,16 @@ def unwrap_and_save_reload_schedule(scheduler, num_steps=10): lrs.append(scheduler.get_lr()) if step == num_steps // 2: with TemporaryDirectory() as tmpdirname: - file_name = os.path.join(tmpdirname, 'schedule.bin') + file_name = os.path.join(tmpdirname, "schedule.bin") torch.save(scheduler.state_dict(), file_name) state_dict = torch.load(file_name) scheduler.load_state_dict(state_dict) return lrs + @require_torch class OptimizationTest(unittest.TestCase): - def assertListAlmostEqual(self, list1, list2, tol): self.assertEqual(len(list1), 
len(list2)) for a, b in zip(list1, list2): @@ -74,7 +77,7 @@ class OptimizationTest(unittest.TestCase): loss = criterion(w, target) loss.backward() optimizer.step() - w.grad.detach_() # No zero_grad() function on simple tensors. we do it ourselves. + w.grad.detach_() # No zero_grad() function on simple tensors. we do it ourselves. w.grad.zero_() self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2) @@ -82,7 +85,7 @@ class OptimizationTest(unittest.TestCase): @require_torch class ScheduleInitTest(unittest.TestCase): m = torch.nn.Linear(50, 50) if is_torch_available() else None - optimizer = AdamW(m.parameters(), lr=10.) if is_torch_available() else None + optimizer = AdamW(m.parameters(), lr=10.0) if is_torch_available() else None num_steps = 10 def assertListAlmostEqual(self, list1, list2, tol): @@ -93,7 +96,7 @@ class ScheduleInitTest(unittest.TestCase): def test_constant_scheduler(self): scheduler = get_constant_schedule(self.optimizer) lrs = unwrap_schedule(scheduler, self.num_steps) - expected_learning_rates = [10.] * self.num_steps + expected_learning_rates = [10.0] * self.num_steps self.assertEqual(len(lrs[0]), 1) self.assertListEqual([l[0] for l in lrs], expected_learning_rates) @@ -135,13 +138,17 @@ class ScheduleInitTest(unittest.TestCase): self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) def test_warmup_cosine_hard_restart_scheduler(self): - scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10) + scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( + self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10 + ) lrs = unwrap_schedule(scheduler, self.num_steps) expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0] self.assertEqual(len(lrs[0]), 1) self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) - scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10) + scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( + self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10 + ) lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) diff --git a/transformers/tests/optimization_tf_test.py b/transformers/tests/optimization_tf_test.py index 515d12a15..e88ee971e 100644 --- a/transformers/tests/optimization_tf_test.py +++ b/transformers/tests/optimization_tf_test.py @@ -12,7 +12,7 @@ if is_tf_available(): import tensorflow as tf from tensorflow.python.eager import context from tensorflow.python.framework import ops - from transformers import (create_optimizer, GradientAccumulator) + from transformers import create_optimizer, GradientAccumulator @require_tf @@ -21,7 +21,7 @@ class OptimizationFTest(unittest.TestCase): self.assertEqual(len(list1), len(list2)) for a, b in zip(list1, list2): self.assertAlmostEqual(a, b, delta=tol) - + def testGradientAccumulator(self): accumulator = GradientAccumulator() accumulator([tf.constant([1.0, 2.0])]) @@ -42,8 +42,8 @@ class OptimizationFTest(unittest.TestCase): physical_devices = tf.config.experimental.list_physical_devices("CPU") tf.config.experimental.set_virtual_device_configuration( physical_devices[0], - [tf.config.experimental.VirtualDeviceConfiguration(), - tf.config.experimental.VirtualDeviceConfiguration()]) + [tf.config.experimental.VirtualDeviceConfiguration(), 
tf.config.experimental.VirtualDeviceConfiguration()], + ) devices = tf.config.experimental.list_logical_devices(device_type="CPU") strategy = tf.distribute.MirroredStrategy(devices=[device.name for device in devices]) @@ -87,4 +87,4 @@ class OptimizationFTest(unittest.TestCase): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/transformers/tests/pipelines_test.py b/transformers/tests/pipelines_test.py index 08a150777..3c258594d 100644 --- a/transformers/tests/pipelines_test.py +++ b/transformers/tests/pipelines_test.py @@ -6,58 +6,58 @@ from transformers import pipeline from transformers.tests.utils import require_tf, require_torch QA_FINETUNED_MODELS = { - ('bert-base-uncased', 'bert-large-uncased-whole-word-masking-finetuned-squad', None), - ('bert-base-cased', 'bert-large-cased-whole-word-masking-finetuned-squad', None), - ('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None) + ("bert-base-uncased", "bert-large-uncased-whole-word-masking-finetuned-squad", None), + ("bert-base-cased", "bert-large-cased-whole-word-masking-finetuned-squad", None), + ("bert-base-uncased", "distilbert-base-uncased-distilled-squad", None), } TF_QA_FINETUNED_MODELS = { - ('bert-base-uncased', 'bert-large-uncased-whole-word-masking-finetuned-squad', None), - ('bert-base-cased', 'bert-large-cased-whole-word-masking-finetuned-squad', None), - ('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None) + ("bert-base-uncased", "bert-large-uncased-whole-word-masking-finetuned-squad", None), + ("bert-base-cased", "bert-large-cased-whole-word-masking-finetuned-squad", None), + ("bert-base-uncased", "distilbert-base-uncased-distilled-squad", None), } TF_NER_FINETUNED_MODELS = { ( - 'bert-base-cased', - 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5', - 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json' + "bert-base-cased", + "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5", + "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json", ) } NER_FINETUNED_MODELS = { ( - 'bert-base-cased', - 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin', - 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json' + "bert-base-cased", + "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin", + "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json", ) } FEATURE_EXTRACT_FINETUNED_MODELS = { - ('bert-base-cased', 'bert-base-cased', None), - # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2 - ('distilbert-base-uncased', 'distilbert-base-uncased', None) + ("bert-base-cased", "bert-base-cased", None), + # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2 + ("distilbert-base-uncased", "distilbert-base-uncased", None), } TF_FEATURE_EXTRACT_FINETUNED_MODELS = { - ('bert-base-cased', 'bert-base-cased', None), - # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2 - ('distilbert-base-uncased', 'distilbert-base-uncased', None) + ("bert-base-cased", "bert-base-cased", None), + # ('xlnet-base-cased', 
'xlnet-base-cased', None), # Disabled for now as it crash for TF2 + ("distilbert-base-uncased", "distilbert-base-uncased", None), } TF_TEXT_CLASSIF_FINETUNED_MODELS = { ( - 'bert-base-uncased', - 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5', - 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json' + "bert-base-uncased", + "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5", + "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json", ) } TEXT_CLASSIF_FINETUNED_MODELS = { ( - 'bert-base-uncased', - 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin', - 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json' + "bert-base-uncased", + "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin", + "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json", ) } @@ -91,54 +91,54 @@ class MonoColumnInputTestCase(unittest.TestCase): @require_torch def test_ner(self): - mandatory_keys = {'entity', 'word', 'score'} - valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + mandatory_keys = {"entity", "word", "score"} + valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in NER_FINETUNED_MODELS: - nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="ner", model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) @require_tf def test_tf_ner(self): - mandatory_keys = {'entity', 'word', 'score'} - valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + mandatory_keys = {"entity", "word", "score"} + valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in TF_NER_FINETUNED_MODELS: - nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="ner", model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) @require_torch def test_sentiment_analysis(self): - mandatory_keys = {'label'} - valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + mandatory_keys = {"label"} + valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS: - nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) @require_tf def test_tf_sentiment_analysis(self): - mandatory_keys = {'label'} - valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & 
Paris'] + mandatory_keys = {"label"} + valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in TF_TEXT_CLASSIF_FINETUNED_MODELS: - nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) @require_torch def test_features_extraction(self): - valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS: - nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {}) @require_tf def test_tf_features_extraction(self): - valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in TF_FEATURE_EXTRACT_FINETUNED_MODELS: - nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {}) @@ -165,46 +165,46 @@ class MultiColumnInputTestCase(unittest.TestCase): @require_torch def test_question_answering(self): - mandatory_output_keys = {'score', 'answer', 'start', 'end'} + mandatory_output_keys = {"score", "answer", "start", "end"} valid_samples = [ - {'question': 'Where was HuggingFace founded ?', 'context': 'HuggingFace was founded in Paris.'}, + {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."}, { - 'question': 'In what field is HuggingFace working ?', - 'context': 'HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.' 
- } + "question": "In what field is HuggingFace working ?", + "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.", + }, ] invalid_samples = [ - {'question': '', 'context': 'This is a test to try empty question edge case'}, - {'question': None, 'context': 'This is a test to try empty question edge case'}, - {'question': 'What is does with empty context ?', 'context': ''}, - {'question': 'What is does with empty context ?', 'context': None}, + {"question": "", "context": "This is a test to try empty question edge case"}, + {"question": None, "context": "This is a test to try empty question edge case"}, + {"question": "What is does with empty context ?", "context": ""}, + {"question": "What is does with empty context ?", "context": None}, ] for tokenizer, model, config in QA_FINETUNED_MODELS: - nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="question-answering", model=model, config=config, tokenizer=tokenizer) self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys) @require_tf def test_tf_question_answering(self): - mandatory_output_keys = {'score', 'answer', 'start', 'end'} + mandatory_output_keys = {"score", "answer", "start", "end"} valid_samples = [ - {'question': 'Where was HuggingFace founded ?', 'context': 'HuggingFace was founded in Paris.'}, + {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."}, { - 'question': 'In what field is HuggingFace working ?', - 'context': 'HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.' - } + "question": "In what field is HuggingFace working ?", + "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.", + }, ] invalid_samples = [ - {'question': '', 'context': 'This is a test to try empty question edge case'}, - {'question': None, 'context': 'This is a test to try empty question edge case'}, - {'question': 'What is does with empty context ?', 'context': ''}, - {'question': 'What is does with empty context ?', 'context': None}, + {"question": "", "context": "This is a test to try empty question edge case"}, + {"question": None, "context": "This is a test to try empty question edge case"}, + {"question": "What is does with empty context ?", "context": ""}, + {"question": "What is does with empty context ?", "context": None}, ] for tokenizer, model, config in TF_QA_FINETUNED_MODELS: - nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="question-answering", model=model, config=config, tokenizer=tokenizer) self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_albert_test.py b/transformers/tests/tokenization_albert_test.py index 59eb3bceb..7d7e793b5 100644 --- a/transformers/tests/tokenization_albert_test.py +++ b/transformers/tests/tokenization_albert_test.py @@ -17,12 +17,12 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest -from transformers.tokenization_albert import (AlbertTokenizer, SPIECE_UNDERLINE) +from transformers.tokenization_albert import AlbertTokenizer, SPIECE_UNDERLINE from .tokenization_tests_commons import CommonTestCases -SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), 
- 'fixtures/spiece.model') +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model") + class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester): @@ -39,27 +39,30 @@ class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester): return AlbertTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"this is a test" - output_text = u"this is a test" + input_text = "this is a test" + output_text = "this is a test" return input_text, output_text - def test_full_tokenizer(self): tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokens = tokenizer.tokenize(u'This is a test') - self.assertListEqual(tokens, [u'▁this', u'▁is', u'▁a', u'▁test']) + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"]) - self.assertListEqual( - tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289]) - tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") - self.assertListEqual(tokens, [u'▁i', u'▁was', u'▁born', u'▁in', u'▁9', u'2000', u',', u'▁and', u'▁this', u'▁is', u'▁fal', u's', u'é', u'.']) + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."] + ) ids = tokenizer.convert_tokens_to_ids(tokens) self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9]) back_tokens = tokenizer.convert_ids_to_tokens(ids) - self.assertListEqual(back_tokens, ['▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fal', 's', '', '.']) + self.assertListEqual( + back_tokens, + ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "."], + ) def test_sequence_builders(self): tokenizer = AlbertTokenizer(SAMPLE_VOCAB) @@ -71,8 +74,10 @@ class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester): encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] - assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [tokenizer.sep_token_id] + assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ + tokenizer.sep_token_id + ] -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_auto_test.py b/transformers/tests/tokenization_auto_test.py index 0a894cac0..7d77bf5b2 100644 --- a/transformers/tests/tokenization_auto_test.py +++ b/transformers/tests/tokenization_auto_test.py @@ -48,5 +48,6 @@ class AutoTokenizerTest(unittest.TestCase): self.assertIsInstance(tokenizer, BertTokenizer) self.assertEqual(len(tokenizer), 12) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_bert_japanese_test.py b/transformers/tests/tokenization_bert_japanese_test.py index 545193c7c..02eb8c0a6 100644 --- a/transformers/tests/tokenization_bert_japanese_test.py +++ b/transformers/tests/tokenization_bert_japanese_test.py @@ -19,9 +19,12 @@ import unittest from io import open from transformers.tokenization_bert import WordpieceTokenizer -from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer, - MecabTokenizer, CharacterTokenizer, - VOCAB_FILES_NAMES) +from 
transformers.tokenization_bert_japanese import ( + BertJapaneseTokenizer, + MecabTokenizer, + CharacterTokenizer, + VOCAB_FILES_NAMES, +) from .tokenization_tests_commons import CommonTestCases from .utils import slow, custom_tokenizers @@ -35,9 +38,24 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester): def setUp(self): super(BertJapaneseTokenizationTest, self).setUp() - vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]", - u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは", - u"世界", u"##世界", u"、", u"##、", u"。", u"##。"] + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "こんにちは", + "こん", + "にちは", + "ばんは", + "##こん", + "##にちは", + "##ばんは", + "世界", + "##世界", + "、", + "##、", + "。", + "##。", + ] self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: @@ -47,70 +65,63 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester): return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"こんにちは、世界。 \nこんばんは、世界。" - output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。" + input_text = "こんにちは、世界。 \nこんばんは、世界。" + output_text = "こんにちは 、 世界 。 こんばんは 、 世界 。" return input_text, output_text def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。") - self.assertListEqual(tokens, - [u"こんにちは", u"、", u"世界", u"。", - u"こん", u"##ばんは", u"、", u"世界", "。"]) - self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), - [3, 12, 10, 14, 4, 9, 12, 10, 14]) + tokens = tokenizer.tokenize("こんにちは、世界。\nこんばんは、世界。") + self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14]) def test_mecab_tokenizer(self): tokenizer = MecabTokenizer() self.assertListEqual( - tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "), - [u"アップルストア", u"で", u"iPhone", u"8", u"が", - u"発売", u"さ", u"れ", u"た", u"。"]) + tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), + ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"], + ) def test_mecab_tokenizer_lower(self): tokenizer = MecabTokenizer(do_lower_case=True) self.assertListEqual( - tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "), - [u"アップルストア", u"で", u"iphone", u"8", u"が", - u"発売", u"さ", u"れ", u"た", u"。"]) + tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), + ["アップルストア", "で", "iphone", "8", "が", "発売", "さ", "れ", "た", "。"], + ) def test_mecab_tokenizer_no_normalize(self): tokenizer = MecabTokenizer(normalize_text=False) self.assertListEqual( - tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "), - [u"アップルストア", u"で", u"iPhone", u"8", u"が", - u"発売", u"さ", u"れ", u"た", u" ", u"。"]) + tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), + ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", " ", "。"], + ) def test_wordpiece_tokenizer(self): - vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]", - u"こんにちは", u"こん", u"にちは" u"ばんは", u"##こん", u"##にちは", u"##ばんは"] + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは" "ばんは", "##こん", "##にちは", "##ばんは"] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i - tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]") + tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") - self.assertListEqual(tokenizer.tokenize(u""), []) + self.assertListEqual(tokenizer.tokenize(""), []) - 
self.assertListEqual(tokenizer.tokenize(u"こんにちは"), - [u"こんにちは"]) + self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こんにちは"]) - self.assertListEqual(tokenizer.tokenize(u"こんばんは"), - [u"こん", u"##ばんは"]) + self.assertListEqual(tokenizer.tokenize("こんばんは"), ["こん", "##ばんは"]) - self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"), - [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"]) + self.assertListEqual(tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"]) @slow def test_sequence_builders(self): tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese") - text = tokenizer.encode(u"ありがとう。", add_special_tokens=False) - text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False) + text = tokenizer.encode("ありがとう。", add_special_tokens=False) + text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False) encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) @@ -127,58 +138,51 @@ class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTeste def setUp(self): super(BertJapaneseCharacterTokenizationTest, self).setUp() - vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]", - u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"] + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"] self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_tokenizer(self, **kwargs): - return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, - subword_tokenizer_type="character", - **kwargs) + return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, subword_tokenizer_type="character", **kwargs) def get_input_output_texts(self): - input_text = u"こんにちは、世界。 \nこんばんは、世界。" - output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。" + input_text = "こんにちは、世界。 \nこんばんは、世界。" + output_text = "こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。" return input_text, output_text def test_full_tokenizer(self): - tokenizer = self.tokenizer_class(self.vocab_file, - subword_tokenizer_type="character") + tokenizer = self.tokenizer_class(self.vocab_file, subword_tokenizer_type="character") - tokens = tokenizer.tokenize(u"こんにちは、世界。 \nこんばんは、世界。") - self.assertListEqual(tokens, - [u"こ", u"ん", u"に", u"ち", u"は", u"、", u"世", u"界", u"。", - u"こ", u"ん", u"ば", u"ん", u"は", u"、", u"世", u"界", u"。"]) - self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), - [3, 4, 5, 6, 7, 11, 9, 10, 12, - 3, 4, 8, 4, 7, 11, 9, 10, 12]) + tokens = tokenizer.tokenize("こんにちは、世界。 \nこんばんは、世界。") + self.assertListEqual( + tokens, ["こ", "ん", "に", "ち", "は", "、", "世", "界", "。", "こ", "ん", "ば", "ん", "は", "、", "世", "界", "。"] + ) + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), [3, 4, 5, 6, 7, 11, 9, 10, 12, 3, 4, 8, 4, 7, 11, 9, 10, 12] + ) def test_character_tokenizer(self): - vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]", - u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界"u"、", u"。"] + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界" "、", "。"] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i - tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]") + tokenizer = CharacterTokenizer(vocab=vocab, unk_token="[UNK]") - self.assertListEqual(tokenizer.tokenize(u""), []) + self.assertListEqual(tokenizer.tokenize(""), []) - self.assertListEqual(tokenizer.tokenize(u"こんにちは"), - 
[u"こ", u"ん", u"に", u"ち", u"は"]) + self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こ", "ん", "に", "ち", "は"]) - self.assertListEqual(tokenizer.tokenize(u"こんにちほ"), - [u"こ", u"ん", u"に", u"ち", u"[UNK]"]) + self.assertListEqual(tokenizer.tokenize("こんにちほ"), ["こ", "ん", "に", "ち", "[UNK]"]) @slow def test_sequence_builders(self): tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char") - text = tokenizer.encode(u"ありがとう。", add_special_tokens=False) - text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False) + text = tokenizer.encode("ありがとう。", add_special_tokens=False) + text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False) encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) @@ -186,6 +190,3 @@ class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTeste # 2 is for "[CLS]", 3 is for "[SEP]" assert encoded_sentence == [2] + text + [3] assert encoded_pair == [2] + text + [3] + text_2 + [3] - - - diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py index c503ea5e1..bf023761a 100644 --- a/transformers/tests/tokenization_bert_test.py +++ b/transformers/tests/tokenization_bert_test.py @@ -18,15 +18,20 @@ import os import unittest from io import open -from transformers.tokenization_bert import (BasicTokenizer, - BertTokenizer, - WordpieceTokenizer, - _is_control, _is_punctuation, - _is_whitespace, VOCAB_FILES_NAMES) +from transformers.tokenization_bert import ( + BasicTokenizer, + BertTokenizer, + WordpieceTokenizer, + _is_control, + _is_punctuation, + _is_whitespace, + VOCAB_FILES_NAMES, +) from .tokenization_tests_commons import CommonTestCases from .utils import slow + class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): tokenizer_class = BertTokenizer @@ -35,55 +40,61 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): super(BertTokenizationTest, self).setUp() vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", - "##ing", ",", "low", "lowest", + "[UNK]", + "[CLS]", + "[SEP]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_tokenizer(self, **kwargs): return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"UNwant\u00E9d,running" - output_text = u"unwanted, running" + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" return input_text, output_text def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") + tokens = tokenizer.tokenize("UNwant\u00E9d,running") self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) def test_chinese(self): tokenizer = BasicTokenizer() - self.assertListEqual( - tokenizer.tokenize(u"ah\u535A\u63A8zz"), - [u"ah", u"\u535A", u"\u63A8", u"zz"]) + self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", 
"zz"]) def test_basic_tokenizer_lower(self): tokenizer = BasicTokenizer(do_lower_case=True) self.assertListEqual( - tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), - ["hello", "!", "how", "are", "you", "?"]) - self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) def test_basic_tokenizer_no_lower(self): tokenizer = BasicTokenizer(do_lower_case=False) self.assertListEqual( - tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), - ["HeLLo", "!", "how", "Are", "yoU", "?"]) + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] + ) def test_wordpiece_tokenizer(self): - vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", - "##ing" - ] + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] vocab = {} for (i, token) in enumerate(vocab_tokens): @@ -92,39 +103,36 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): self.assertListEqual(tokenizer.tokenize(""), []) - self.assertListEqual( - tokenizer.tokenize("unwanted running"), - ["un", "##want", "##ed", "runn", "##ing"]) + self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) - self.assertListEqual( - tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) def test_is_whitespace(self): - self.assertTrue(_is_whitespace(u" ")) - self.assertTrue(_is_whitespace(u"\t")) - self.assertTrue(_is_whitespace(u"\r")) - self.assertTrue(_is_whitespace(u"\n")) - self.assertTrue(_is_whitespace(u"\u00A0")) + self.assertTrue(_is_whitespace(" ")) + self.assertTrue(_is_whitespace("\t")) + self.assertTrue(_is_whitespace("\r")) + self.assertTrue(_is_whitespace("\n")) + self.assertTrue(_is_whitespace("\u00A0")) - self.assertFalse(_is_whitespace(u"A")) - self.assertFalse(_is_whitespace(u"-")) + self.assertFalse(_is_whitespace("A")) + self.assertFalse(_is_whitespace("-")) def test_is_control(self): - self.assertTrue(_is_control(u"\u0005")) + self.assertTrue(_is_control("\u0005")) - self.assertFalse(_is_control(u"A")) - self.assertFalse(_is_control(u" ")) - self.assertFalse(_is_control(u"\t")) - self.assertFalse(_is_control(u"\r")) + self.assertFalse(_is_control("A")) + self.assertFalse(_is_control(" ")) + self.assertFalse(_is_control("\t")) + self.assertFalse(_is_control("\r")) def test_is_punctuation(self): - self.assertTrue(_is_punctuation(u"-")) - self.assertTrue(_is_punctuation(u"$")) - self.assertTrue(_is_punctuation(u"`")) - self.assertTrue(_is_punctuation(u".")) + self.assertTrue(_is_punctuation("-")) + self.assertTrue(_is_punctuation("$")) + self.assertTrue(_is_punctuation("`")) + self.assertTrue(_is_punctuation(".")) - self.assertFalse(_is_punctuation(u"A")) - self.assertFalse(_is_punctuation(u" ")) + self.assertFalse(_is_punctuation("A")) + self.assertFalse(_is_punctuation(" ")) @slow def test_sequence_builders(self): @@ -140,5 +148,5 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): assert encoded_pair == [101] + text + [102] + text_2 + [102] -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_ctrl_test.py b/transformers/tests/tokenization_ctrl_test.py index ad16cf07f..04c9dec52 100644 --- 
a/transformers/tests/tokenization_ctrl_test.py +++ b/transformers/tests/tokenization_ctrl_test.py @@ -22,6 +22,7 @@ from transformers.tokenization_ctrl import CTRLTokenizer, VOCAB_FILES_NAMES from .tokenization_tests_commons import CommonTestCases + class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester): tokenizer_class = CTRLTokenizer @@ -30,13 +31,13 @@ class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester): super(CTRLTokenizationTest, self).setUp() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt - vocab = ['adapt', 're@@', 'a@@', 'apt', 'c@@', 't', ''] + vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", ""] vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", 'a p', 'ap t', 'r e', 'a d', 'ad apt', ''] + merges = ["#version: 0.2", "a p", "ap t", "r e", "a d", "ad apt", ""] self.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) with open(self.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") with open(self.merges_file, "w", encoding="utf-8") as fp: @@ -47,23 +48,22 @@ class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester): return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"adapt react readapt apt" - output_text = u"adapt react readapt apt" + input_text = "adapt react readapt apt" + output_text = "adapt react readapt apt" return input_text, output_text def test_full_tokenizer(self): tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) text = "adapt react readapt apt" - bpe_tokens = 'adapt re@@ a@@ c@@ t re@@ adapt apt'.split() + bpe_tokens = "adapt re@@ a@@ c@@ t re@@ adapt apt".split() tokens = tokenizer.tokenize(text) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + [tokenizer.unk_token] input_bpe_tokens = [0, 1, 2, 4, 5, 1, 0, 3, 6] - self.assertListEqual( - tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_distilbert_test.py b/transformers/tests/tokenization_distilbert_test.py index e815eca67..551f9e188 100644 --- a/transformers/tests/tokenization_distilbert_test.py +++ b/transformers/tests/tokenization_distilbert_test.py @@ -18,12 +18,13 @@ import os import unittest from io import open -from transformers.tokenization_distilbert import (DistilBertTokenizer) +from transformers.tokenization_distilbert import DistilBertTokenizer from .tokenization_tests_commons import CommonTestCases from .tokenization_bert_test import BertTokenizationTest from .utils import slow + class DistilBertTokenizationTest(BertTokenizationTest): tokenizer_class = DistilBertTokenizer @@ -42,9 +43,10 @@ class DistilBertTokenizationTest(BertTokenizationTest): encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] - assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \ - text_2 + [tokenizer.sep_token_id] + 
assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ + tokenizer.sep_token_id + ] -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_gpt2_test.py b/transformers/tests/tokenization_gpt2_test.py index 5eae767bd..552b73416 100644 --- a/transformers/tests/tokenization_gpt2_test.py +++ b/transformers/tests/tokenization_gpt2_test.py @@ -23,6 +23,7 @@ from transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES from .tokenization_tests_commons import CommonTestCases + class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): tokenizer_class = GPT2Tokenizer @@ -31,16 +32,34 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): super(GPT2TokenizationTest, self).setUp() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt - vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", - "\u0120", "\u0120l", "\u0120n", - "\u0120lo", "\u0120low", "er", - "\u0120lowest", "\u0120newer", "\u0120wider", ""] + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "", + ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] self.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) with open(self.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") with open(self.merges_file, "w", encoding="utf-8") as fp: @@ -51,8 +70,8 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"lower newer" - output_text = u"lower newer" + input_text = "lower newer" + output_text = "lower newer" return input_text, output_text def test_full_tokenizer(self): @@ -64,8 +83,8 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): input_tokens = tokens + [tokenizer.unk_token] input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] - self.assertListEqual( - tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_openai_test.py b/transformers/tests/tokenization_openai_test.py index 56aa219dd..c6a802b7b 100644 --- a/transformers/tests/tokenization_openai_test.py +++ b/transformers/tests/tokenization_openai_test.py @@ -31,15 +31,34 @@ class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester): super(OpenAIGPTTokenizationTest, self).setUp() # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt - vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", - "w", "r", "t", - "lo", "low", "er", - "low", "lowest", "newer", "wider", ""] + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "w", + "r", + "t", + "lo", + "low", + "er", + "low", + "lowest", + "newer", + "wider", + "", + ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "l o", "lo w", "e r", ""] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) with open(self.vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens)) with open(self.merges_file, "w") as fp: @@ -49,11 +68,10 @@ class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester): return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"lower newer" - output_text = u"lower newer" + input_text = "lower newer" + output_text = "lower newer" return input_text, output_text - def test_full_tokenizer(self): tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file) @@ -64,9 +82,8 @@ class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester): input_tokens = tokens + [""] input_bpe_tokens = [14, 15, 20] - self.assertListEqual( - tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_roberta_test.py b/transformers/tests/tokenization_roberta_test.py index 8ad0b5951..a1d9d5fb7 100644 --- a/transformers/tests/tokenization_roberta_test.py +++ b/transformers/tests/tokenization_roberta_test.py @@ -31,16 +31,34 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): super(RobertaTokenizationTest, self).setUp() # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt - vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", - "\u0120", "\u0120l", "\u0120n", - "\u0120lo", "\u0120low", "er", - "\u0120lowest", "\u0120newer", "\u0120wider", ""] + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "", + ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] self.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) with open(self.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") with open(self.merges_file, "w", encoding="utf-8") as fp: @@ -51,8 +69,8 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"lower newer" - output_text = u"lower newer" + input_text = "lower newer" + output_text = "lower newer" return input_text, output_text def test_full_tokenizer(self): @@ -64,19 +82,15 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): input_tokens = tokens + [tokenizer.unk_token] input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] - self.assertListEqual( - tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) def roberta_dict_integration_testing(self): tokenizer = self.get_tokenizer() + self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2]) self.assertListEqual( - tokenizer.encode('Hello world!', add_special_tokens=False), - [0, 31414, 232, 328, 2] - ) - self.assertListEqual( - tokenizer.encode('Hello world! cécé herlolip 418', add_special_tokens=False), - [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2] + tokenizer.encode("Hello world! 
cécé herlolip 418", add_special_tokens=False), + [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2], ) @slow @@ -87,7 +101,9 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True) - encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True) + encoded_pair_from_decode = tokenizer.encode( + "sequence builders", "multi-sequence build", add_special_tokens=True + ) encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) @@ -96,5 +112,5 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): assert encoded_pair == encoded_pair_from_decode -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_t5_test.py b/transformers/tests/tokenization_t5_test.py index 0b4f960e3..09bc0267f 100644 --- a/transformers/tests/tokenization_t5_test.py +++ b/transformers/tests/tokenization_t5_test.py @@ -17,13 +17,13 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest -from transformers.tokenization_t5 import (T5Tokenizer) +from transformers.tokenization_t5 import T5Tokenizer from transformers.tokenization_xlnet import SPIECE_UNDERLINE from .tokenization_tests_commons import CommonTestCases -SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), - 'fixtures/test_sentencepiece.model') +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + class T5TokenizationTest(CommonTestCases.CommonTokenizerTester): @@ -40,38 +40,76 @@ class T5TokenizationTest(CommonTestCases.CommonTokenizerTester): return T5Tokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"This is a test" - output_text = u"This is a test" + input_text = "This is a test" + output_text = "This is a test" return input_text, output_text def test_full_tokenizer(self): tokenizer = T5Tokenizer(SAMPLE_VOCAB) - tokens = tokenizer.tokenize(u'This is a test') - self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est']) + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) - self.assertListEqual( - tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) - tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") - self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', - u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', - u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.']) - ids = tokenizer.convert_tokens_to_ids(tokens) + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") self.assertListEqual( - ids, [8, 21, 84, 55, 24, 19, 7, 0, - 602, 347, 347, 347, 3, 12, 66, - 46, 72, 80, 6, 0, 4]) + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", 
+ SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4]) back_tokens = tokenizer.convert_ids_to_tokens(ids) - self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', - u'or', u'n', SPIECE_UNDERLINE + u'in', - SPIECE_UNDERLINE + u'', u'', u'2', u'0', u'0', u'0', u',', - SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', - u'', u'.']) - - -if __name__ == '__main__': + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "", + ".", + ], + ) + + +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index c417d033d..ba8110108 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -26,19 +26,23 @@ if sys.version_info[0] == 2: class TemporaryDirectory(object): """Context manager for tempfile.mkdtemp() so it's usable with "with" statement.""" + def __enter__(self): self.name = tempfile.mkdtemp() return self.name + def __exit__(self, exc_type, exc_value, traceback): shutil.rmtree(self.name) + + else: import pickle + TemporaryDirectory = tempfile.TemporaryDirectory unicode = str class CommonTestCases: - class CommonTokenizerTester(unittest.TestCase): tokenizer_class = None @@ -57,17 +61,23 @@ class CommonTestCases: def test_tokenizers_common_properties(self): tokenizer = self.get_tokenizer() - attributes_list = ["bos_token", "eos_token", "unk_token", "sep_token", - "pad_token", "cls_token", "mask_token"] + attributes_list = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + ] for attr in attributes_list: self.assertTrue(hasattr(tokenizer, attr)) self.assertTrue(hasattr(tokenizer, attr + "_id")) self.assertTrue(hasattr(tokenizer, "additional_special_tokens")) - self.assertTrue(hasattr(tokenizer, 'additional_special_tokens_ids')) + self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids")) - attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", - "added_tokens_decoder"] + attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", "added_tokens_decoder"] for attr in attributes_list: self.assertTrue(hasattr(tokenizer, attr)) @@ -79,13 +89,13 @@ class CommonTestCases: # Now let's start the test tokenizer = self.get_tokenizer(max_len=42) - before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False) + before_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False) with TemporaryDirectory() as tmpdirname: tokenizer.save_pretrained(tmpdirname) tokenizer = self.tokenizer_class.from_pretrained(tmpdirname) - after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False) + after_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", 
add_special_tokens=False) self.assertListEqual(before_tokens, after_tokens) self.assertEqual(tokenizer.max_len, 42) @@ -96,12 +106,12 @@ class CommonTestCases: tokenizer = self.get_tokenizer() self.assertIsNotNone(tokenizer) - text = u"Munich and Berlin are nice cities" + text = "Munich and Berlin are nice cities" subwords = tokenizer.tokenize(text) with TemporaryDirectory() as tmpdirname: - filename = os.path.join(tmpdirname, u"tokenizer.bin") + filename = os.path.join(tmpdirname, "tokenizer.bin") with open(filename, "wb") as handle: pickle.dump(tokenizer, handle) @@ -122,7 +132,7 @@ class CommonTestCases: toks0 = tokenizer.tokenize(text) # toks before adding new_toks - new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", 'AAAAA BBBBBB', 'CCCCCCCCCDDDDDDDD'] + new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"] added = tokenizer.add_tokens(new_toks) self.assertEqual(added, 2) @@ -178,8 +188,7 @@ class CommonTestCases: self.assertGreater(tokens[0], tokenizer.vocab_size - 1) self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - new_toks_2 = {'eos_token': ">>>>|||<||<<|<<", - 'pad_token': "<<<<<|||>|>>>>|>"} + new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} added_toks_2 = tokenizer.add_special_tokens(new_toks_2) vocab_size_3 = tokenizer.vocab_size all_size_3 = len(tokenizer) @@ -189,8 +198,9 @@ class CommonTestCases: self.assertEqual(added_toks_2, len(new_toks_2)) self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) - tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", - add_special_tokens=False) + tokens = tokenizer.encode( + ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False + ) out_string = tokenizer.decode(tokens) self.assertGreaterEqual(len(tokens), 6) @@ -242,7 +252,7 @@ class CommonTestCases: def test_encode_decode_with_spaces(self): tokenizer = self.get_tokenizer() - new_toks = ['[ABC]', '[DEF]', 'GHI IHG'] + new_toks = ["[ABC]", "[DEF]", "GHI IHG"] tokenizer.add_tokens(new_toks) input = "[ABC] [DEF] [ABC] GHI IHG [DEF]" encoded = tokenizer.encode(input, add_special_tokens=False) @@ -264,7 +274,7 @@ class CommonTestCases: tokenizer = self.get_tokenizer() - if tokenizer.build_inputs_with_special_tokens.__qualname__.split('.')[0] != "PreTrainedTokenizer": + if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer": seq_0 = "Test this method." seq_1 = "With these inputs." 
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True) @@ -293,17 +303,19 @@ class CommonTestCases: sequence = tokenizer.encode(seq_0, add_special_tokens=False) num_added_tokens = tokenizer.num_added_tokens() total_length = len(sequence) + num_added_tokens - information = tokenizer.encode_plus(seq_0, - max_length=total_length - 2, - add_special_tokens=True, - stride=stride, - return_overflowing_tokens=True) + information = tokenizer.encode_plus( + seq_0, + max_length=total_length - 2, + add_special_tokens=True, + stride=stride, + return_overflowing_tokens=True, + ) truncated_sequence = information["input_ids"] overflowing_tokens = information["overflowing_tokens"] self.assertEqual(len(overflowing_tokens), 2 + stride) - self.assertEqual(overflowing_tokens, sequence[-(2 + stride):]) + self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :]) self.assertEqual(len(truncated_sequence), total_length - 2) self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2])) @@ -320,24 +332,35 @@ class CommonTestCases: sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) truncated_second_sequence = tokenizer.build_inputs_with_special_tokens( tokenizer.encode(seq_0, add_special_tokens=False), - tokenizer.encode(seq_1, add_special_tokens=False)[:-2] + tokenizer.encode(seq_1, add_special_tokens=False)[:-2], ) - information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True, - stride=stride, truncation_strategy='only_second', - return_overflowing_tokens=True) - information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, - add_special_tokens=True, stride=stride, - truncation_strategy='only_first', - return_overflowing_tokens=True) + information = tokenizer.encode_plus( + seq_0, + seq_1, + max_length=len(sequence) - 2, + add_special_tokens=True, + stride=stride, + truncation_strategy="only_second", + return_overflowing_tokens=True, + ) + information_first_truncated = tokenizer.encode_plus( + seq_0, + seq_1, + max_length=len(sequence) - 2, + add_special_tokens=True, + stride=stride, + truncation_strategy="only_first", + return_overflowing_tokens=True, + ) truncated_sequence = information["input_ids"] overflowing_tokens = information["overflowing_tokens"] overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"] self.assertEqual(len(overflowing_tokens), 2 + stride) - self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride):]) - self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride):]) + self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride) :]) + self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride) :]) self.assertEqual(len(truncated_sequence), len(sequence) - 2) self.assertEqual(truncated_sequence, truncated_second_sequence) @@ -361,37 +384,47 @@ class CommonTestCases: # Testing single inputs encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) - encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True, return_special_tokens_mask=True) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, add_special_tokens=True, return_special_tokens_mask=True + ) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - 
filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)] + filtered_sequence = [ + (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) + ] filtered_sequence = [x for x in filtered_sequence if x is not None] self.assertEqual(encoded_sequence, filtered_sequence) # Testing inputs pairs - encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(sequence_1, - add_special_tokens=False) - encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True, - return_special_tokens_mask=True) + encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode( + sequence_1, add_special_tokens=False + ) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True + ) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)] + filtered_sequence = [ + (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) + ] filtered_sequence = [x for x in filtered_sequence if x is not None] self.assertEqual(encoded_sequence, filtered_sequence) # Testing with already existing special tokens if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id: - tokenizer.add_special_tokens({'cls_token': '', 'sep_token': ''}) - encoded_sequence_dict = tokenizer.encode_plus(sequence_0, - add_special_tokens=True, - return_special_tokens_mask=True) + tokenizer.add_special_tokens({"cls_token": "", "sep_token": ""}) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, add_special_tokens=True, return_special_tokens_mask=True + ) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"] - special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True) + special_tokens_mask = tokenizer.get_special_tokens_mask( + encoded_sequence_w_special, already_has_special_tokens=True + ) self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) self.assertEqual(special_tokens_mask_orig, special_tokens_mask) @@ -406,7 +439,9 @@ class CommonTestCases: tokenizer.padding_side = "right" encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True) + padded_sequence = tokenizer.encode( + sequence, max_length=sequence_length + padding_size, pad_to_max_length=True + ) padded_sequence_length = len(padded_sequence) assert sequence_length + padding_size == padded_sequence_length assert encoded_sequence + [padding_idx] * padding_size == padded_sequence @@ -415,7 +450,9 @@ class CommonTestCases: tokenizer.padding_side = "left" encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True) + padded_sequence = tokenizer.encode( + sequence, max_length=sequence_length + padding_size, pad_to_max_length=True + ) padded_sequence_length = 
len(padded_sequence) assert sequence_length + padding_size == padded_sequence_length assert [padding_idx] * padding_size + encoded_sequence == padded_sequence @@ -446,38 +483,48 @@ class CommonTestCases: token_type_padding_idx = tokenizer.pad_token_type_id encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True) - input_ids = encoded_sequence['input_ids'] - token_type_ids = encoded_sequence['token_type_ids'] - attention_mask = encoded_sequence['attention_mask'] - special_tokens_mask = encoded_sequence['special_tokens_mask'] + input_ids = encoded_sequence["input_ids"] + token_type_ids = encoded_sequence["token_type_ids"] + attention_mask = encoded_sequence["attention_mask"] + special_tokens_mask = encoded_sequence["special_tokens_mask"] sequence_length = len(input_ids) # Test right padding tokenizer.padding_side = "right" - padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True) - padded_input_ids = padded_sequence['input_ids'] - padded_token_type_ids = padded_sequence['token_type_ids'] - padded_attention_mask = padded_sequence['attention_mask'] - padded_special_tokens_mask = padded_sequence['special_tokens_mask'] + padded_sequence = tokenizer.encode_plus( + sequence, + max_length=sequence_length + padding_size, + pad_to_max_length=True, + return_special_tokens_mask=True, + ) + padded_input_ids = padded_sequence["input_ids"] + padded_token_type_ids = padded_sequence["token_type_ids"] + padded_attention_mask = padded_sequence["attention_mask"] + padded_special_tokens_mask = padded_sequence["special_tokens_mask"] padded_sequence_length = len(padded_input_ids) assert sequence_length + padding_size == padded_sequence_length assert input_ids + [padding_idx] * padding_size == padded_input_ids assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids - assert attention_mask + [0] * padding_size == padded_attention_mask - assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask + assert attention_mask + [0] * padding_size == padded_attention_mask + assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask # Test left padding tokenizer.padding_side = "left" - padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True) - padded_input_ids = padded_sequence['input_ids'] - padded_token_type_ids = padded_sequence['token_type_ids'] - padded_attention_mask = padded_sequence['attention_mask'] - padded_special_tokens_mask = padded_sequence['special_tokens_mask'] + padded_sequence = tokenizer.encode_plus( + sequence, + max_length=sequence_length + padding_size, + pad_to_max_length=True, + return_special_tokens_mask=True, + ) + padded_input_ids = padded_sequence["input_ids"] + padded_token_type_ids = padded_sequence["token_type_ids"] + padded_attention_mask = padded_sequence["attention_mask"] + padded_special_tokens_mask = padded_sequence["special_tokens_mask"] padded_sequence_length = len(padded_input_ids) assert sequence_length + padding_size == padded_sequence_length assert [padding_idx] * padding_size + input_ids == padded_input_ids assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids - assert [0] * padding_size + attention_mask == padded_attention_mask - assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask \ No newline at end of file + assert [0] * padding_size + 
attention_mask == padded_attention_mask + assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask diff --git a/transformers/tests/tokenization_transfo_xl_test.py b/transformers/tests/tokenization_transfo_xl_test.py index 5495ebd3a..8b737283d 100644 --- a/transformers/tests/tokenization_transfo_xl_test.py +++ b/transformers/tests/tokenization_transfo_xl_test.py @@ -37,45 +37,53 @@ class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester): super(TransfoXLTokenizationTest, self).setUp() vocab_tokens = [ - "", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", - "running", ",", "low", "l", + "", + "[CLS]", + "[SEP]", + "want", + "unwanted", + "wa", + "un", + "running", + ",", + "low", + "l", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_tokenizer(self, **kwargs): - kwargs['lower_case'] = True + kwargs["lower_case"] = True return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u" UNwanted , running" - output_text = u" unwanted, running" + input_text = " UNwanted , running" + output_text = " unwanted, running" return input_text, output_text def test_full_tokenizer(self): tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True) - tokens = tokenizer.tokenize(u" UNwanted , running") + tokens = tokenizer.tokenize(" UNwanted , running") self.assertListEqual(tokens, ["", "unwanted", ",", "running"]) - self.assertListEqual( - tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7]) def test_full_tokenizer_lower(self): tokenizer = TransfoXLTokenizer(lower_case=True) self.assertListEqual( - tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "), - ["hello", "!", "how", "are", "you", "?"]) + tokenizer.tokenize(" \tHeLLo ! how \n Are yoU ? "), ["hello", "!", "how", "are", "you", "?"] + ) def test_full_tokenizer_no_lower(self): tokenizer = TransfoXLTokenizer(lower_case=False) self.assertListEqual( - tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "), - ["HeLLo", "!", "how", "Are", "yoU", "?"]) + tokenizer.tokenize(" \tHeLLo ! how \n Are yoU ? 
"), ["HeLLo", "!", "how", "Are", "yoU", "?"] + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_utils_test.py b/transformers/tests/tokenization_utils_test.py index ff3f80ff7..4fa92c44b 100644 --- a/transformers/tests/tokenization_utils_test.py +++ b/transformers/tests/tokenization_utils_test.py @@ -24,8 +24,8 @@ from transformers.tokenization_gpt2 import GPT2Tokenizer from .utils import slow -class TokenizerUtilsTest(unittest.TestCase): +class TokenizerUtilsTest(unittest.TestCase): def check_tokenizer_from_pretrained(self, tokenizer_class): s3_models = list(tokenizer_class.max_model_input_sizes.keys()) for model_name in s3_models[:1]: @@ -46,5 +46,6 @@ class TokenizerUtilsTest(unittest.TestCase): def test_pretrained_tokenizers(self): self.check_tokenizer_from_pretrained(GPT2Tokenizer) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_xlm_test.py b/transformers/tests/tokenization_xlm_test.py index 7582a4666..e9aa2b7d0 100644 --- a/transformers/tests/tokenization_xlm_test.py +++ b/transformers/tests/tokenization_xlm_test.py @@ -23,6 +23,7 @@ from transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES from .tokenization_tests_commons import CommonTestCases from .utils import slow + class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): tokenizer_class = XLMTokenizer @@ -31,15 +32,34 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): super(XLMTokenizationTest, self).setUp() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt - vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", - "w", "r", "t", - "lo", "low", "er", - "low", "lowest", "newer", "wider", ""] + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "w", + "r", + "t", + "lo", + "low", + "er", + "low", + "lowest", + "newer", + "wider", + "", + ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["l o 123", "lo w 1456", "e r 1789", ""] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) with open(self.vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens)) with open(self.merges_file, "w") as fp: @@ -49,8 +69,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"lower newer" - output_text = u"lower newer" + input_text = "lower newer" + output_text = "lower newer" return input_text, output_text def test_full_tokenizer(self): @@ -64,8 +84,7 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): input_tokens = tokens + [""] input_bpe_tokens = [14, 15, 20] - self.assertListEqual( - tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) @slow def test_sequence_builders(self): @@ -80,5 +99,6 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): assert encoded_sentence == [1] + text + [1] assert encoded_pair == [1] + text + [1] + text_2 + [1] -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_xlnet_test.py 
b/transformers/tests/tokenization_xlnet_test.py index b68495a79..32482449a 100644 --- a/transformers/tests/tokenization_xlnet_test.py +++ b/transformers/tests/tokenization_xlnet_test.py @@ -17,13 +17,13 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest -from transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE) +from transformers.tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE from .tokenization_tests_commons import CommonTestCases from .utils import slow -SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), - 'fixtures/test_sentencepiece.model') +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): @@ -40,55 +40,135 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"This is a test" - output_text = u"This is a test" + input_text = "This is a test" + output_text = "This is a test" return input_text, output_text - def test_full_tokenizer(self): tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokens = tokenizer.tokenize(u'This is a test') - self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est']) + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) - self.assertListEqual( - tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) - tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") - self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', - u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', - u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.']) - ids = tokenizer.convert_tokens_to_ids(tokens) + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") self.assertListEqual( - ids, [8, 21, 84, 55, 24, 19, 7, 0, - 602, 347, 347, 347, 3, 12, 66, - 46, 72, 80, 6, 0, 4]) + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4]) back_tokens = tokenizer.convert_ids_to_tokens(ids) - self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', - u'or', u'n', SPIECE_UNDERLINE + u'in', - SPIECE_UNDERLINE + u'', u'', u'2', u'0', u'0', u'0', u',', - SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', - u'', u'.']) + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE 
+ "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "", + ".", + ], + ) def test_tokenizer_lower(self): tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True) - tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") - self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'', u'i', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', - u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', - u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.']) - self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), [u"▁he", u"ll", u"o"]) + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "", + "i", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "se", + ".", + ], + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["▁he", "ll", "o"]) def test_tokenizer_no_lower(self): tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False) - tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") - self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', - u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', - u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.']) + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "se", + ".", + ], + ) @slow def test_sequence_builders(self): @@ -104,5 +184,5 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): assert encoded_pair == text + [4] + text_2 + [4, 3] -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py index ba0e19f42..aab5e5a8a 100644 --- a/transformers/tests/utils.py +++ b/transformers/tests/utils.py @@ -27,6 +27,7 @@ def parse_flag_from_env(key, default=False): raise ValueError("If set, {} must be yes or no.".format(key)) return _value + _run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False) _run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False) diff --git a/transformers/tokenization_albert.py b/transformers/tokenization_albert.py index 6b92d0721..b03b3ca11 100644 --- a/transformers/tokenization_albert.py +++ b/transformers/tokenization_albert.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" Tokenization classes for ALBERT model.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals from .tokenization_utils import PreTrainedTokenizer import logging @@ -24,34 +23,34 @@ import os from shutil import copyfile logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'} +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model", - 'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model", - 'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model", - 'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model", - 'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model", - 'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model", - 'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model", - 'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model", + "vocab_file": { + "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model", + "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model", + "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model", + "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model", + "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model", + "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model", + "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model", + "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'albert-base-v1': 512, - 'albert-large-v1': 512, - 'albert-xlarge-v1': 512, - 'albert-xxlarge-v1': 512, - 'albert-base-v2': 512, - 'albert-large-v2': 512, - 'albert-xlarge-v2': 512, - 'albert-xxlarge-v2': 512, + "albert-base-v1": 512, + "albert-large-v1": 512, + "albert-xlarge-v1": 512, + "albert-xxlarge-v1": 512, + "albert-base-v2": 512, + "albert-large-v2": 512, + "albert-xlarge-v2": 512, + "albert-xxlarge-v2": 512, } -SPIECE_UNDERLINE = u'▁' +SPIECE_UNDERLINE = "▁" + class AlbertTokenizer(PreTrainedTokenizer): """ @@ -59,18 +58,36 @@ class AlbertTokenizer(PreTrainedTokenizer): - requires `SentencePiece `_ """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, - do_lower_case=True, remove_space=True, keep_accents=False, - bos_token="[CLS]", eos_token="[SEP]", unk_token="", sep_token="[SEP]", - pad_token="", cls_token="[CLS]", mask_token="[MASK]", **kwargs): - super(AlbertTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, - unk_token=unk_token, sep_token=sep_token, - pad_token=pad_token, cls_token=cls_token, - mask_token=mask_token, **kwargs) + def __init__( + self, + vocab_file, + do_lower_case=True, + 
remove_space=True, + keep_accents=False, + bos_token="[CLS]", + eos_token="[SEP]", + unk_token="", + sep_token="[SEP]", + pad_token="", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + super(AlbertTokenizer, self).__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens @@ -78,8 +95,10 @@ class AlbertTokenizer(PreTrainedTokenizer): try: import sentencepiece as spm except ImportError: - logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece") + logger.warning( + "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" + "pip install sentencepiece" + ) self.do_lower_case = do_lower_case self.remove_space = remove_space @@ -103,24 +122,26 @@ class AlbertTokenizer(PreTrainedTokenizer): try: import sentencepiece as spm except ImportError: - logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece") + logger.warning( + "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" + "pip install sentencepiece" + ) self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def preprocess_text(self, inputs): if self.remove_space: - outputs = ' '.join(inputs.strip().split()) + outputs = " ".join(inputs.strip().split()) else: outputs = inputs outputs = outputs.replace("``", '"').replace("''", '"') if six.PY2 and isinstance(outputs, str): - outputs = outputs.decode('utf-8') + outputs = outputs.decode("utf-8") if not self.keep_accents: - outputs = unicodedata.normalize('NFKD', outputs) - outputs = ''.join([c for c in outputs if not unicodedata.combining(c)]) + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) if self.do_lower_case: outputs = outputs.lower() @@ -133,7 +154,7 @@ class AlbertTokenizer(PreTrainedTokenizer): text = self.preprocess_text(text) # note(zhiliny): in some systems, sentencepiece only accepts str for py2 if six.PY2 and isinstance(text, unicode): - text = text.encode('utf-8') + text = text.encode("utf-8") if not sample: pieces = self.sp_model.EncodeAsPieces(text) @@ -141,9 +162,8 @@ class AlbertTokenizer(PreTrainedTokenizer): pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) new_pieces = [] for piece in pieces: - if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit(): - cur_pieces = self.sp_model.EncodeAsPieces( - piece[:-1].replace(SPIECE_UNDERLINE, '')) + if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): + cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: if len(cur_pieces[0]) == 1: cur_pieces = cur_pieces[1:] @@ -159,7 +179,7 @@ class AlbertTokenizer(PreTrainedTokenizer): ret_pieces = [] for piece in new_pieces: if isinstance(piece, str): - piece = piece.decode('utf-8') + piece = piece.decode("utf-8") ret_pieces.append(piece) new_pieces = ret_pieces @@ -173,12 +193,12 @@ class AlbertTokenizer(PreTrainedTokenizer): """Converts an index (integer) in a token (string/unicode) using the 
vocab.""" token = self.sp_model.IdToPiece(index) if six.PY2 and return_unicode and isinstance(token, str): - token = token.decode('utf-8') + token = token.decode("utf-8") return token def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -213,8 +233,10 @@ class AlbertTokenizer(PreTrainedTokenizer): if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: @@ -244,7 +266,7 @@ class AlbertTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py index 5377bd48c..5d36fdcba 100644 --- a/transformers/tokenization_auto.py +++ b/transformers/tokenization_auto.py @@ -35,6 +35,7 @@ from .tokenization_xlm_roberta import XLMRobertaTokenizer logger = logging.getLogger(__name__) + class AutoTokenizer(object): r""":class:`~transformers.AutoTokenizer` is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library @@ -62,9 +63,12 @@ class AutoTokenizer(object): This class cannot be instantiated using `__init__()` (throw an error). """ + def __init__(self): - raise EnvironmentError("AutoTokenizer is designed to be instantiated " - "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.") + raise EnvironmentError( + "AutoTokenizer is designed to be instantiated " + "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method." 
+ ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): @@ -125,34 +129,38 @@ class AutoTokenizer(object): tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') """ - if 't5' in pretrained_model_name_or_path: + if "t5" in pretrained_model_name_or_path: return T5Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: + elif "distilbert" in pretrained_model_name_or_path: return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'albert' in pretrained_model_name_or_path: + elif "albert" in pretrained_model_name_or_path: return AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'camembert' in pretrained_model_name_or_path: + elif "camembert" in pretrained_model_name_or_path: return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'xlm-roberta' in pretrained_model_name_or_path: + elif "xlm-roberta" in pretrained_model_name_or_path: return XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: + elif "roberta" in pretrained_model_name_or_path: return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'bert-base-japanese' in pretrained_model_name_or_path: + elif "bert-base-japanese" in pretrained_model_name_or_path: return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'openai-gpt' in pretrained_model_name_or_path: + elif "openai-gpt" in pretrained_model_name_or_path: return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'gpt2' in pretrained_model_name_or_path: + elif "gpt2" in pretrained_model_name_or_path: return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'transfo-xl' in pretrained_model_name_or_path: + elif "transfo-xl" in pretrained_model_name_or_path: return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'ctrl' in pretrained_model_name_or_path: + elif "ctrl" in pretrained_model_name_or_path: return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm-roberta', 'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. 
Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm-roberta', 'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format( + pretrained_model_name_or_path + ) + ) diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index edc26d88c..7b3705cc1 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -26,69 +26,68 @@ from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", - 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", - 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", - 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", - 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", - 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", - 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", - 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", - 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", - 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", - 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt", + "vocab_file": { + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", + "bert-base-multilingual-cased": 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", + "bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", + "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", + "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", + "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", + "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'bert-base-uncased': 512, - 'bert-large-uncased': 512, - 'bert-base-cased': 512, - 'bert-large-cased': 512, - 'bert-base-multilingual-uncased': 512, - 'bert-base-multilingual-cased': 512, - 'bert-base-chinese': 512, - 'bert-base-german-cased': 512, - 'bert-large-uncased-whole-word-masking': 512, - 'bert-large-cased-whole-word-masking': 512, - 'bert-large-uncased-whole-word-masking-finetuned-squad': 512, - 'bert-large-cased-whole-word-masking-finetuned-squad': 512, - 'bert-base-cased-finetuned-mrpc': 512, - 'bert-base-german-dbmdz-cased': 512, - 'bert-base-german-dbmdz-uncased': 512, - 'bert-base-finnish-cased-v1': 512, - 'bert-base-finnish-uncased-v1': 512, + "bert-base-uncased": 512, + "bert-large-uncased": 512, + "bert-base-cased": 512, + "bert-large-cased": 512, + "bert-base-multilingual-uncased": 512, + "bert-base-multilingual-cased": 512, + "bert-base-chinese": 512, + "bert-base-german-cased": 512, + "bert-large-uncased-whole-word-masking": 512, + "bert-large-cased-whole-word-masking": 512, + "bert-large-uncased-whole-word-masking-finetuned-squad": 512, + "bert-large-cased-whole-word-masking-finetuned-squad": 512, + "bert-base-cased-finetuned-mrpc": 512, + "bert-base-german-dbmdz-cased": 512, + "bert-base-german-dbmdz-uncased": 512, + "bert-base-finnish-cased-v1": 512, + "bert-base-finnish-uncased-v1": 512, } PRETRAINED_INIT_CONFIGURATION = { - 'bert-base-uncased': {'do_lower_case': True}, - 'bert-large-uncased': {'do_lower_case': True}, - 'bert-base-cased': {'do_lower_case': False}, - 'bert-large-cased': {'do_lower_case': False}, - 'bert-base-multilingual-uncased': {'do_lower_case': True}, - 'bert-base-multilingual-cased': {'do_lower_case': False}, - 'bert-base-chinese': {'do_lower_case': False}, - 'bert-base-german-cased': {'do_lower_case': False}, - 'bert-large-uncased-whole-word-masking': {'do_lower_case': True}, - 
'bert-large-cased-whole-word-masking': {'do_lower_case': False}, - 'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True}, - 'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False}, - 'bert-base-cased-finetuned-mrpc': {'do_lower_case': False}, - 'bert-base-german-dbmdz-cased': {'do_lower_case': False}, - 'bert-base-german-dbmdz-uncased': {'do_lower_case': True}, - 'bert-base-finnish-cased-v1': {'do_lower_case': False}, - 'bert-base-finnish-uncased-v1': {'do_lower_case': True}, + "bert-base-uncased": {"do_lower_case": True}, + "bert-large-uncased": {"do_lower_case": True}, + "bert-base-cased": {"do_lower_case": False}, + "bert-large-cased": {"do_lower_case": False}, + "bert-base-multilingual-uncased": {"do_lower_case": True}, + "bert-base-multilingual-cased": {"do_lower_case": False}, + "bert-base-chinese": {"do_lower_case": False}, + "bert-base-german-cased": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, + "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, + "bert-base-german-dbmdz-cased": {"do_lower_case": False}, + "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, + "bert-base-finnish-cased-v1": {"do_lower_case": False}, + "bert-base-finnish-uncased-v1": {"do_lower_case": True}, } @@ -98,7 +97,7 @@ def load_vocab(vocab_file): with open(vocab_file, "r", encoding="utf-8") as reader: tokens = reader.readlines() for index, token in enumerate(tokens): - token = token.rstrip('\n') + token = token.rstrip("\n") vocab[token] = index return vocab @@ -132,9 +131,20 @@ class BertTokenizer(PreTrainedTokenizer): pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None, - unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", - mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs): + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + **kwargs + ): """Constructs a BertTokenizer. Args: @@ -152,24 +162,29 @@ class BertTokenizer(PreTrainedTokenizer): This should likely be deactivated for Japanese: see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 """ - super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, - pad_token=pad_token, cls_token=cls_token, - mask_token=mask_token, **kwargs) + super(BertTokenizer, self).__init__( + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens if not os.path.isfile(vocab_file): raise ValueError( "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict( - [(ids, tok) for tok, ids in self.vocab.items()]) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) self.do_basic_tokenize = do_basic_tokenize if do_basic_tokenize: - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, - never_split=never_split, - tokenize_chinese_chars=tokenize_chinese_chars) + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars + ) self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) @property @@ -196,7 +211,7 @@ class BertTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ - out_string = ' '.join(tokens).replace(' ##', '').strip() + out_string = " ".join(tokens).replace(" ##", "").strip() return out_string def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -231,8 +246,10 @@ class BertTokenizer(PreTrainedTokenizer): if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: @@ -258,16 +275,18 @@ class BertTokenizer(PreTrainedTokenizer): """Save the tokenizer vocabulary to a directory or file.""" index = 0 if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file']) + vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"]) else: vocab_file = vocab_path with open(vocab_file, "w", encoding="utf-8") as writer: for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): if index != token_index: - logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." - " Please check that the vocabulary is not corrupted!".format(vocab_file)) + logger.warning( + "Saving vocabulary to {}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!".format(vocab_file) + ) index = token_index - writer.write(token + u'\n') + writer.write(token + "\n") index += 1 return (vocab_file,) @@ -382,14 +401,16 @@ class BasicTokenizer(object): # as is Japanese Hiragana and Katakana. Those alphabets are used to write # space-separated words, so they are not treated specially and handled # like the all of the other languages. 
- if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # return True return False @@ -399,7 +420,7 @@ class BasicTokenizer(object): output = [] for char in text: cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): + if cp == 0 or cp == 0xFFFD or _is_control(char): continue if _is_whitespace(char): output.append(" ") @@ -499,8 +520,7 @@ def _is_punctuation(char): # Characters such as "^", "$", and "`" are not in the Unicode # Punctuation class but we treat them as punctuation anyways, for # consistency. - if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): return True cat = unicodedata.category(char) if cat.startswith("P"): diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py index 0ff45cbfe..48b9b04b4 100644 --- a/transformers/tokenization_bert_japanese.py +++ b/transformers/tokenization_bert_japanese.py @@ -28,46 +28,45 @@ from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt", - 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt", - 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt", - 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt" + "vocab_file": { + "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt", + "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt", + "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt", + "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'bert-base-japanese': 512, - 'bert-base-japanese-whole-word-masking': 512, - 'bert-base-japanese-char': 512, - 'bert-base-japanese-char-whole-word-masking': 512 + "bert-base-japanese": 512, + "bert-base-japanese-whole-word-masking": 512, + "bert-base-japanese-char": 512, + "bert-base-japanese-char-whole-word-masking": 512, } PRETRAINED_INIT_CONFIGURATION = { - 'bert-base-japanese': { - 'do_lower_case': False, - 'word_tokenizer_type': 
'mecab', - 'subword_tokenizer_type': 'wordpiece' + "bert-base-japanese": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "wordpiece", }, - 'bert-base-japanese-whole-word-masking':{ - 'do_lower_case': False, - 'word_tokenizer_type': 'mecab', - 'subword_tokenizer_type': 'wordpiece' + "bert-base-japanese-whole-word-masking": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "wordpiece", }, - 'bert-base-japanese-char': { - 'do_lower_case': False, - 'word_tokenizer_type': 'mecab', - 'subword_tokenizer_type': 'character' + "bert-base-japanese-char": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "character", + }, + "bert-base-japanese-char-whole-word-masking": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "character", }, - 'bert-base-japanese-char-whole-word-masking': { - 'do_lower_case': False, - 'word_tokenizer_type': 'mecab', - 'subword_tokenizer_type': 'character' - } } @@ -79,11 +78,22 @@ class BertJapaneseTokenizer(BertTokenizer): pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, do_lower_case=False, - do_word_tokenize=True, do_subword_tokenize=True, - word_tokenizer_type='basic', subword_tokenizer_type='wordpiece', - never_split=None, unk_token='[UNK]', sep_token='[SEP]', - pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs): + def __init__( + self, + vocab_file, + do_lower_case=False, + do_word_tokenize=True, + do_subword_tokenize=True, + word_tokenizer_type="basic", + subword_tokenizer_type="wordpiece", + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): """Constructs a MecabBertTokenizer. Args: @@ -100,56 +110,53 @@ class BertJapaneseTokenizer(BertTokenizer): **subword_tokenizer_type**: (`optional`) string (default "wordpiece") Type of subword tokenizer. """ - super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, - pad_token=pad_token, cls_token=cls_token, - mask_token=mask_token, **kwargs) + super(BertTokenizer, self).__init__( + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens if not os.path.isfile(vocab_file): raise ValueError( "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict( - [(ids, tok) for tok, ids in self.vocab.items()]) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) self.do_word_tokenize = do_word_tokenize if do_word_tokenize: - if word_tokenizer_type == 'basic': - self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, - never_split=never_split, - tokenize_chinese_chars=False) - elif word_tokenizer_type == 'mecab': - self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case, - never_split=never_split) + if word_tokenizer_type == "basic": + self.word_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False + ) + elif word_tokenizer_type == "mecab": + self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case, never_split=never_split) else: - raise ValueError( - "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type)) + raise ValueError("Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type)) self.do_subword_tokenize = do_subword_tokenize if do_subword_tokenize: - if subword_tokenizer_type == 'wordpiece': - self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, - unk_token=self.unk_token) - elif subword_tokenizer_type == 'character': - self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, - unk_token=self.unk_token) + if subword_tokenizer_type == "wordpiece": + self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + elif subword_tokenizer_type == "character": + self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token) else: - raise ValueError( - "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type)) - + raise ValueError("Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type)) def _tokenize(self, text): if self.do_word_tokenize: - tokens = self.word_tokenizer.tokenize(text, - never_split=self.all_special_tokens) + tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens) else: tokens = [text] if self.do_subword_tokenize: - split_tokens = [sub_token for token in tokens - for sub_token in self.subword_tokenizer.tokenize(token)] + split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)] else: split_tokens = tokens @@ -177,27 +184,28 @@ class MecabTokenizer(object): self.normalize_text = normalize_text import MeCab + self.mecab = MeCab.Tagger() def tokenize(self, text, never_split=None, **kwargs): """Tokenizes a piece of text.""" if self.normalize_text: - text = unicodedata.normalize('NFKC', text) + text = unicodedata.normalize("NFKC", text) never_split = self.never_split + (never_split if never_split is not None else []) tokens = [] if six.PY2: - mecab_output = self.mecab.parse(text.encode('utf-8')).decode('utf-8') + mecab_output = self.mecab.parse(text.encode("utf-8")).decode("utf-8") else: mecab_output = self.mecab.parse(text) cursor = 0 - for line in mecab_output.split('\n'): - if line == 'EOS': + for line in mecab_output.split("\n"): + if line == "EOS": break - token, _ = line.split('\t') + token, _ = line.split("\t") token_start = text.index(token, cursor) 
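The MecabTokenizer.tokenize loop being re-wrapped here does nothing more than walk the tagger output line by line ("surface<TAB>features", terminated by "EOS") and locate each surface form in the original text with str.index. A standalone sketch with hand-written tagger output — MeCab itself is not needed to see the offset bookkeeping:

    import unicodedata

    # Toy sketch of the offset-tracking loop above; the tagger output is hand-written.
    text = unicodedata.normalize("NFKC", "吾輩は猫である")
    tagger_output = "吾輩\t名詞\nは\t助詞\n猫\t名詞\nで\t助動詞\nある\t助動詞\nEOS\n"

    tokens, cursor = [], 0
    for line in tagger_output.split("\n"):
        if line == "EOS" or not line:
            break
        token, _features = line.split("\t")
        token_start = text.index(token, cursor)   # find the surface form at/after cursor
        token_end = token_start + len(token)
        tokens.append(token)
        cursor = token_end

    print(tokens)   # ['吾輩', 'は', '猫', 'で', 'ある']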
token_end = token_start + len(token) if self.do_lower_case and token not in never_split: @@ -240,7 +248,7 @@ class CharacterTokenizer(object): A list of characters. """ if self.normalize_text: - text = unicodedata.normalize('NFKC', text) + text = unicodedata.normalize("NFKC", text) output_tokens = [] for i, char in enumerate(text): diff --git a/transformers/tokenization_camembert.py b/transformers/tokenization_camembert.py index 4c4615eb3..c1e80e0e0 100644 --- a/transformers/tokenization_camembert.py +++ b/transformers/tokenization_camembert.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License """ Tokenization classes for Camembert model.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging import os @@ -26,19 +25,19 @@ from .tokenization_xlnet import SPIECE_UNDERLINE logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'} +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model", + "vocab_file": { + "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'camembert-base': None, + "camembert-base": None, } + class CamembertTokenizer(PreTrainedTokenizer): """ Adapted from RobertaTokenizer and XLNetTokenizer @@ -46,17 +45,36 @@ class CamembertTokenizer(PreTrainedTokenizer): - requires `SentencePiece `_ """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, bos_token="", eos_token="", sep_token="", - cls_token="", unk_token="", pad_token='', mask_token='', - additional_special_tokens=['NOTUSED', 'NOTUSED'], **kwargs): - super(CamembertTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, - sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, - mask_token=mask_token, additional_special_tokens=additional_special_tokens, - **kwargs) + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + additional_special_tokens=["NOTUSED", "NOTUSED"], + **kwargs + ): + super(CamembertTokenizer, self).__init__( + max_len=512, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens self.sp_model = spm.SentencePieceProcessor() @@ -64,9 +82,9 @@ class CamembertTokenizer(PreTrainedTokenizer): self.vocab_file = vocab_file # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual # sentencepiece vocabulary (this is the case for and - self.fairseq_tokens_to_ids = {'NOTUSED': 0, '': 1, 'NOTUSED': 2, '': 3} + self.fairseq_tokens_to_ids = {"NOTUSED": 0, "": 1, "NOTUSED": 2, "": 3} self.fairseq_offset = 
len(self.fairseq_tokens_to_ids) - self.fairseq_tokens_to_ids[''] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) + self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -100,8 +118,10 @@ class CamembertTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is None: @@ -148,7 +168,7 @@ class CamembertTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def save_vocabulary(self, save_directory): @@ -158,7 +178,7 @@ class CamembertTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/transformers/tokenization_ctrl.py b/transformers/tokenization_ctrl.py index 219f17c40..2ce2bbf09 100644 --- a/transformers/tokenization_ctrl.py +++ b/transformers/tokenization_ctrl.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for Salesforce CTRL.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import json import logging @@ -27,23 +26,17 @@ from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { - 'vocab_file': 'vocab.json', - 'merges_file': 'merges.txt', + "vocab_file": "vocab.json", + "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json", - }, - 'merges_file': - { - 'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt", - }, + "vocab_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json",}, + "merges_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt",}, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'ctrl': 256, + "ctrl": 256, } CONTROL_CODES = { @@ -104,6 +97,7 @@ CONTROL_CODES = { "multilingual": 128406, } + def get_pairs(word): """Return set of symbol pairs in a word. @@ -118,11 +112,13 @@ def get_pairs(word): pairs = set(pairs) return pairs + class CTRLTokenizer(PreTrainedTokenizer): """ CTRL BPE tokenizer. 
Peculiarities: - Byte-Pair-Encoding """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES @@ -130,14 +126,18 @@ class CTRLTokenizer(PreTrainedTokenizer): def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs) - self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens - self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens + self.max_len_single_sentence = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens + self.max_len_sentences_pair = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) - self.decoder = {v:k for k,v in self.encoder.items()} - with open(merges_file, encoding='utf-8') as merges_handle: - merges = merges_handle.read().split('\n')[1:-1] + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[1:-1] merges = [tuple(merge.split()) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} @@ -150,14 +150,14 @@ class CTRLTokenizer(PreTrainedTokenizer): if token in self.cache: return self.cache[token] word = tuple(token) - word = tuple(list(word[:-1]) + [word[-1]+'']) + word = tuple(list(word[:-1]) + [word[-1] + ""]) pairs = get_pairs(word) if not pairs: return token while True: - bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram @@ -172,8 +172,8 @@ class CTRLTokenizer(PreTrainedTokenizer): new_word.extend(word[i:]) break - if word[i] == first and i < len(word)-1 and word[i+1] == second: - new_word.append(first+second) + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) i += 2 else: new_word.append(word[i]) @@ -184,7 +184,7 @@ class CTRLTokenizer(PreTrainedTokenizer): break else: pairs = get_pairs(word) - word = '@@ '.join(word) + word = "@@ ".join(word) word = word[:-4] self.cache[token] = word return word @@ -194,10 +194,10 @@ class CTRLTokenizer(PreTrainedTokenizer): """ split_tokens = [] - words = re.findall(r'\S+\n?', text) + words = re.findall(r"\S+\n?", text) for token in words: - split_tokens.extend([t for t in self.bpe(token).split(' ')]) + split_tokens.extend([t for t in self.bpe(token).split(" ")]) return split_tokens def _convert_token_to_id(self, token): @@ -210,7 +210,7 @@ class CTRLTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. 
""" - out_string = ' '.join(tokens).replace('@@ ', '').strip() + out_string = " ".join(tokens).replace("@@ ", "").strip() return out_string def save_vocabulary(self, save_directory): @@ -218,21 +218,23 @@ class CTRLTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) + vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) + merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) - with open(vocab_file, 'w', encoding='utf-8') as f: + with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: - writer.write(u'#version: 0.2\n') + writer.write("#version: 0.2\n") for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: - logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!".format(merge_file)) + logger.warning( + "Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file) + ) index = token_index - writer.write(' '.join(bpe_tokens) + u'\n') + writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file diff --git a/transformers/tokenization_distilbert.py b/transformers/tokenization_distilbert.py index 2f245d71d..7fed1e405 100644 --- a/transformers/tokenization_distilbert.py +++ b/transformers/tokenization_distilbert.py @@ -26,23 +26,22 @@ from .tokenization_bert import BertTokenizer logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", - 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", - 'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", + "vocab_file": { + "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", + "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'distilbert-base-uncased': 512, - 'distilbert-base-uncased-distilled-squad': 512, - 'distilbert-base-german-cased': 512, - 'distilbert-base-multilingual-cased': 512, + "distilbert-base-uncased": 512, + "distilbert-base-uncased-distilled-squad": 512, + "distilbert-base-german-cased": 512, + "distilbert-base-multilingual-cased": 512, } diff --git a/transformers/tokenization_gpt2.py b/transformers/tokenization_gpt2.py index 68c610186..b6a0e7b78 
100644 --- a/transformers/tokenization_gpt2.py +++ b/transformers/tokenization_gpt2.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for OpenAI GPT.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import sys import json @@ -31,42 +30,42 @@ except ImportError: def lru_cache(): return lambda func: func + from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { - 'vocab_file': 'vocab.json', - 'merges_file': 'merges.txt', + "vocab_file": "vocab.json", + "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", - 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", - 'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json", - 'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json", - 'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json", + "vocab_file": { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json", + "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json", }, - 'merges_file': - { - 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", - 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", - 'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt", - 'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt", - 'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt", + "merges_file": { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt", + "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'gpt2': 1024, - 'gpt2-medium': 1024, - 'gpt2-large': 1024, - 'gpt2-xl': 1024, - 'distilgpt2': 1024, + "gpt2": 1024, + "gpt2-medium": 1024, + "gpt2-large": 1024, + "gpt2-xl": 1024, + "distilgpt2": 1024, } + @lru_cache() def bytes_to_unicode(): """ @@ -80,17 +79,20 @@ def bytes_to_unicode(): To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 
""" _chr = unichr if sys.version_info[0] == 2 else chr - bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) cs = bs[:] n = 0 - for b in range(2**8): + for b in range(2 ** 8): if b not in bs: bs.append(b) - cs.append(2**8+n) + cs.append(2 ** 8 + n) n += 1 cs = [_chr(n) for n in cs] return dict(zip(bs, cs)) + def get_pairs(word): """Return set of symbol pairs in a word. @@ -103,6 +105,7 @@ def get_pairs(word): prev_char = char return pairs + class GPT2Tokenizer(PreTrainedTokenizer): """ GPT-2 BPE tokenizer. Peculiarities: @@ -112,15 +115,28 @@ class GPT2Tokenizer(PreTrainedTokenizer): Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"` """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>", - bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs): + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + unk_token="<|endoftext|>", + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + **kwargs + ): super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) - self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens - self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens + self.max_len_single_sentence = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens + self.max_len_sentences_pair = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) @@ -128,8 +144,8 @@ class GPT2Tokenizer(PreTrainedTokenizer): self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - with open(merges_file, encoding='utf-8') as merges_handle: - bpe_merges = merges_handle.read().split('\n')[1:-1] + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_merges] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} @@ -151,7 +167,7 @@ class GPT2Tokenizer(PreTrainedTokenizer): return token while True: - bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram @@ -166,8 +182,8 @@ class GPT2Tokenizer(PreTrainedTokenizer): new_word.extend(word[i:]) break - if word[i] == first and i < len(word)-1 and word[i+1] == second: - new_word.append(first+second) + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) i += 2 else: new_word.append(word[i]) @@ -178,7 +194,7 @@ class GPT2Tokenizer(PreTrainedTokenizer): break else: pairs = get_pairs(word) - word = ' '.join(word) + 
word = " ".join(word) self.cache[token] = word return word @@ -189,15 +205,19 @@ class GPT2Tokenizer(PreTrainedTokenizer): Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers. """ if add_prefix_space: - text = ' ' + text + text = " " + text bpe_tokens = [] for token in re.findall(self.pat, text): if sys.version_info[0] == 2: - token = ''.join(self.byte_encoder[ord(b)] for b in token) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) + token = "".join( + self.byte_encoder[ord(b)] for b in token + ) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) else: - token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) - bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) return bpe_tokens def _convert_token_to_id(self, token): @@ -210,8 +230,8 @@ class GPT2Tokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ - text = ''.join(tokens) - text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) return text def save_vocabulary(self, save_directory): @@ -219,21 +239,23 @@ class GPT2Tokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) + vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) + merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) - with open(vocab_file, 'w', encoding='utf-8') as f: + with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: - writer.write(u'#version: 0.2\n') + writer.write("#version: 0.2\n") for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: - logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!".format(merge_file)) + logger.warning( + "Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file) + ) index = token_index - writer.write(' '.join(bpe_tokens) + u'\n') + writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file diff --git a/transformers/tokenization_openai.py b/transformers/tokenization_openai.py index a4c64b702..d8f7549ed 100644 --- a/transformers/tokenization_openai.py +++ b/transformers/tokenization_openai.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Tokenization classes for OpenAI GPT.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import json import logging @@ -28,25 +27,20 @@ from .tokenization_bert import BasicTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { - 'vocab_file': 'vocab.json', - 'merges_file': 'merges.txt', + "vocab_file": "vocab.json", + "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json", - }, - 'merges_file': - { - 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt", - }, + "vocab_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",}, + "merges_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",}, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'openai-gpt': 512, + "openai-gpt": 512, } + def get_pairs(word): """ Return set of symbol pairs in a word. @@ -59,27 +53,30 @@ def get_pairs(word): prev_char = char return pairs + def text_standardize(text): """ fixes some issues the spacy tokenizer had on books corpus also does some whitespace standardization """ - text = text.replace('—', '-') - text = text.replace('–', '-') - text = text.replace('―', '-') - text = text.replace('…', '...') - text = text.replace('´', "'") - text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text) - text = re.sub(r'\s*\n\s*', ' \n ', text) - text = re.sub(r'[^\S\n]+', ' ', text) + text = text.replace("—", "-") + text = text.replace("–", "-") + text = text.replace("―", "-") + text = text.replace("…", "...") + text = text.replace("´", "'") + text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text) + text = re.sub(r"\s*\n\s*", " \n ", text) + text = re.sub(r"[^\S\n]+", " ", text) return text.strip() + class OpenAIGPTTokenizer(PreTrainedTokenizer): """ BPE tokenizer. Peculiarities: - lower case all inputs - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. 
""" + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES @@ -87,12 +84,17 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs) - self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens - self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens + self.max_len_single_sentence = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens + self.max_len_sentences_pair = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens try: import ftfy from spacy.lang.en import English + _nlp = English() self.nlp = _nlp.Defaults.create_tokenizer(_nlp) self.fix_text = ftfy.fix_text @@ -103,9 +105,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) - self.decoder = {v:k for k,v in self.encoder.items()} - with open(merges_file, encoding='utf-8') as merges_handle: - merges = merges_handle.read().split('\n')[1:-1] + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[1:-1] merges = [tuple(merge.split()) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} @@ -115,16 +117,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): return len(self.encoder) def bpe(self, token): - word = tuple(token[:-1]) + (token[-1] + '',) + word = tuple(token[:-1]) + (token[-1] + "",) if token in self.cache: return self.cache[token] pairs = get_pairs(word) if not pairs: - return token+'' + return token + "" while True: - bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram @@ -139,8 +141,8 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): new_word.extend(word[i:]) break - if word[i] == first and i < len(word)-1 and word[i+1] == second: - new_word.append(first+second) + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) i += 2 else: new_word.append(word[i]) @@ -151,9 +153,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): break else: pairs = get_pairs(word) - word = ' '.join(word) - if word == '\n ': - word = '\n' + word = " ".join(word) + if word == "\n ": + word = "\n" self.cache[token] = word return word @@ -164,12 +166,12 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): # Using BERT's BasicTokenizer text = self.nlp.tokenize(text) for token in text: - split_tokens.extend([t for t in self.bpe(token).split(' ')]) + split_tokens.extend([t for t in self.bpe(token).split(" ")]) else: # Using SpaCy & ftfy (original tokenization process of OpenAI GPT) text = self.nlp(text_standardize(self.fix_text(text))) for token in text: - split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')]) + split_tokens.extend([t for t in self.bpe(token.text.lower()).split(" ")]) return split_tokens def _convert_token_to_id(self, token): @@ -182,7 +184,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): def 
convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ - out_string = ''.join(tokens).replace('', ' ').strip() + out_string = "".join(tokens).replace("", " ").strip() return out_string def save_vocabulary(self, save_directory): @@ -190,21 +192,23 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) + vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) + merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) - with open(vocab_file, 'w', encoding='utf-8') as f: + with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: - writer.write(u'#version: 0.2\n') + writer.write("#version: 0.2\n") for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: - logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!".format(merge_file)) + logger.warning( + "Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file) + ) index = token_index - writer.write(' '.join(bpe_tokens) + u'\n') + writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py index b44e00499..eae8b638f 100644 --- a/transformers/tokenization_roberta.py +++ b/transformers/tokenization_roberta.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
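The save_vocabulary methods reformatted above all persist the same two-file BPE layout: a vocab.json mapping tokens to ids, and a merges.txt listing merge pairs in rank order under a "#version" header (with a warning when the ranks are not consecutive). A toy write/read round-trip of that layout, using made-up tokens and ranks:

    import json, os, tempfile

    # Made-up vocabulary and merge ranks, just to exercise the file layout.
    encoder = {"h": 0, "e": 1, "l": 2, "o": 3, "he": 4, "lo": 5}
    bpe_ranks = {("h", "e"): 0, ("l", "o"): 1}

    save_dir = tempfile.mkdtemp()
    with open(os.path.join(save_dir, "vocab.json"), "w", encoding="utf-8") as f:
        f.write(json.dumps(encoder, ensure_ascii=False))
    with open(os.path.join(save_dir, "merges.txt"), "w", encoding="utf-8") as f:
        f.write("#version: 0.2\n")
        for pair, _rank in sorted(bpe_ranks.items(), key=lambda kv: kv[1]):
            f.write(" ".join(pair) + "\n")

    # Reading back mirrors the constructors above: drop the header line, rebuild the ranks.
    with open(os.path.join(save_dir, "merges.txt"), encoding="utf-8") as f:
        merges = [tuple(line.split()) for line in f.read().split("\n")[1:-1]]
    print(dict(zip(merges, range(len(merges)))))   # {('h', 'e'): 0, ('l', 'o'): 1}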
"""Tokenization classes for RoBERTa.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import sys import json @@ -33,41 +32,40 @@ except ImportError: def lru_cache(): return lambda func: func + logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { - 'vocab_file': 'vocab.json', - 'merges_file': 'merges.txt', + "vocab_file": "vocab.json", + "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", - 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", - 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", - 'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json", - 'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", - 'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", + "vocab_file": { + "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", + "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", + "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", + "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json", + "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", + "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", }, - 'merges_file': - { - 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", - 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", - 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt", - 'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt", - 'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", - 'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", + "merges_file": { + "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", + "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", + "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt", + "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt", + "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", + "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'roberta-base': 512, - 'roberta-large': 512, - 'roberta-large-mnli': 512, - 'distilroberta-base': 512, - 'roberta-base-openai-detector': 512, - 'roberta-large-openai-detector': 512, + "roberta-base": 512, + "roberta-large": 512, + "roberta-large-mnli": 512, + "distilroberta-base": 512, + "roberta-base-openai-detector": 
512, + "roberta-large-openai-detector": 512, } @@ -80,16 +78,38 @@ class RobertaTokenizer(GPT2Tokenizer): Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"` """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, merges_file, errors='replace', bos_token="", eos_token="", sep_token="", - cls_token="", unk_token="", pad_token='', mask_token='', **kwargs): - super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors, - bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, - sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, - mask_token=mask_token, **kwargs) + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + **kwargs + ): + super(RobertaTokenizer, self).__init__( + vocab_file=vocab_file, + merges_file=merges_file, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens @@ -124,8 +144,10 @@ class RobertaTokenizer(GPT2Tokenizer): """ if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is None: diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py index 9fd37b67c..3b70d4085 100644 --- a/transformers/tokenization_t5.py +++ b/transformers/tokenization_t5.py @@ -26,26 +26,25 @@ from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) -SPIECE_UNDERLINE = u'▁' +SPIECE_UNDERLINE = "▁" #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to file names for serializing Tokenizer instances #################################################### -VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'} +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to pretrained vocabulary URL for all the model shortcut names. 
#################################################### PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", - 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", - 't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", - 't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", - 't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + "vocab_file": { + "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", } } @@ -53,13 +52,14 @@ PRETRAINED_VOCAB_FILES_MAP = { # Mapping from model shortcut names to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 't5-small': 512, - 't5-base': 512, - 't5-large': 512, - 't5-3b': 512, - 't5-11b': 512, + "t5-small": 512, + "t5-base": 512, + "t5-large": 512, + "t5-3b": 512, + "t5-11b": 512, } + class T5Tokenizer(PreTrainedTokenizer): """ SentencePiece based tokenizer. Peculiarities: @@ -71,28 +71,43 @@ class T5Tokenizer(PreTrainedTokenizer): (like in T5 preprocessing see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117) """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, eos_token="", unk_token="", - pad_token="", extra_ids=100, additional_special_tokens=None, **kwargs): + def __init__( + self, + vocab_file, + eos_token="", + unk_token="", + pad_token="", + extra_ids=100, + additional_special_tokens=None, + **kwargs + ): # Add extra_ids to the special token list if extra_ids > 0: if additional_special_tokens is None: additional_special_tokens = [] - additional_special_tokens.extend([u"".format(i) for i in range(extra_ids)]) + additional_special_tokens.extend(["".format(i) for i in range(extra_ids)]) - super(T5Tokenizer, self).__init__(eos_token=eos_token, unk_token=unk_token, - pad_token=pad_token, additional_special_tokens=additional_special_tokens, - **kwargs) + super(T5Tokenizer, self).__init__( + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + additional_special_tokens=additional_special_tokens, + **kwargs + ) try: import sentencepiece as spm except ImportError: - logger.warning("You need to install SentencePiece to use T5Tokenizer:" - "https://github.com/google/sentencepiece" - "pip install sentencepiece") + logger.warning( + "You need to install SentencePiece to use T5Tokenizer:" + "https://github.com/google/sentencepiece" + "pip install sentencepiece" + ) self.vocab_file = vocab_file self._extra_ids = extra_ids @@ -114,8 +129,10 @@ class T5Tokenizer(PreTrainedTokenizer): try: import sentencepiece as spm except ImportError: - logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece") + logger.warning( + "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" + 
"pip install sentencepiece" + ) self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) @@ -132,7 +149,7 @@ class T5Tokenizer(PreTrainedTokenizer): ret_pieces = [] for piece in pieces: if isinstance(piece, str): - piece = piece.decode('utf-8') + piece = piece.decode("utf-8") ret_pieces.append(piece) pieces = ret_pieces @@ -140,8 +157,8 @@ class T5Tokenizer(PreTrainedTokenizer): def _convert_token_to_id(self, token): """ Converts a token (str/unicode) in an id using the vocab. """ - if token.startswith(u"', token) + if token.startswith("", token) num = int(l.group(1)) return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) @@ -151,9 +168,9 @@ class T5Tokenizer(PreTrainedTokenizer): if index < self.sp_model.get_piece_size(): token = self.sp_model.IdToPiece(index) else: - token = u"".format(self.vocab_size - 1 - index) + token = "".format(self.vocab_size - 1 - index) if six.PY2 and return_unicode and isinstance(token, str): - token = token.decode('utf-8') + token = token.decode("utf-8") return token def convert_tokens_to_string(self, tokens): @@ -168,7 +185,7 @@ class T5Tokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/transformers/tokenization_transfo_xl.py b/transformers/tokenization_transfo_xl.py index 8d5a0ce9d..b2f59625f 100644 --- a/transformers/tokenization_transfo_xl.py +++ b/transformers/tokenization_transfo_xl.py @@ -16,8 +16,7 @@ """ Tokenization classes for Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. 
""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import glob import logging @@ -44,42 +43,58 @@ except ImportError: logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin', 'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES = {"pretrained_vocab_file": "vocab.bin", "vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { - 'pretrained_vocab_file': - { - 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin", + "pretrained_vocab_file": { + "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'transfo-xl-wt103': None, + "transfo-xl-wt103": None, } PRETRAINED_CORPUS_ARCHIVE_MAP = { - 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin", + "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin", } -CORPUS_NAME = 'corpus.bin' +CORPUS_NAME = "corpus.bin" + class TransfoXLTokenizer(PreTrainedTokenizer): """ Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, special=None, min_freq=0, max_size=None, lower_case=False, - delimiter=None, vocab_file=None, pretrained_vocab_file=None, - never_split=None, unk_token="", eos_token="", - additional_special_tokens=[""], **kwargs): - super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token, - additional_special_tokens=additional_special_tokens, - **kwargs) - - self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens - self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens + def __init__( + self, + special=None, + min_freq=0, + max_size=None, + lower_case=False, + delimiter=None, + vocab_file=None, + pretrained_vocab_file=None, + never_split=None, + unk_token="", + eos_token="", + additional_special_tokens=[""], + **kwargs + ): + super(TransfoXLTokenizer, self).__init__( + unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs + ) + + self.max_len_single_sentence = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens + self.max_len_sentences_pair = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens if never_split is None: never_split = self.all_special_tokens @@ -106,14 +121,15 @@ class TransfoXLTokenizer(PreTrainedTokenizer): self.build_vocab() def count_file(self, path, verbose=False, add_eos=False): - if verbose: logger.info('counting file {} ...'.format(path)) + if verbose: + logger.info("counting file {} ...".format(path)) assert os.path.exists(path) sents = [] - with open(path, 'r', encoding='utf-8') as f: + with open(path, "r", encoding="utf-8") as f: for idx, line in enumerate(f): if verbose and idx > 0 and idx % 500000 == 0: - logger.info(' line {}'.format(idx)) + logger.info(" line {}".format(idx)) symbols = self.tokenize(line, add_eos=add_eos) self.counter.update(symbols) sents.append(symbols) @@ -124,42 +140,42 @@ class 
TransfoXLTokenizer(PreTrainedTokenizer): """ sents : a list of sentences, each a list of tokenized symbols """ - if verbose: logger.info('counting {} sents ...'.format(len(sents))) + if verbose: + logger.info("counting {} sents ...".format(len(sents))) for idx, symbols in enumerate(sents): if verbose and idx > 0 and idx % 500000 == 0: - logger.info(' line {}'.format(idx)) + logger.info(" line {}".format(idx)) self.counter.update(symbols) def _build_from_file(self, vocab_file): self.idx2sym = [] self.sym2idx = OrderedDict() - with open(vocab_file, 'r', encoding='utf-8') as f: + with open(vocab_file, "r", encoding="utf-8") as f: for line in f: symb = line.strip().split()[0] self.add_symbol(symb) - if '' in self.sym2idx: - self.unk_idx = self.sym2idx[''] - elif '' in self.sym2idx: - self.unk_idx = self.sym2idx[''] + if "" in self.sym2idx: + self.unk_idx = self.sym2idx[""] + elif "" in self.sym2idx: + self.unk_idx = self.sym2idx[""] else: - raise ValueError('No token in vocabulary') + raise ValueError("No token in vocabulary") def save_vocabulary(self, vocab_path): """Save the tokenizer vocabulary to a directory or file.""" if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['pretrained_vocab_file']) + vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"]) torch.save(self.__dict__, vocab_file) return (vocab_file,) def build_vocab(self): if self.vocab_file: - logger.info('building vocab from {}'.format(self.vocab_file)) + logger.info("building vocab from {}".format(self.vocab_file)) self._build_from_file(self.vocab_file) - logger.info('final vocab size {}'.format(len(self))) + logger.info("final vocab size {}".format(len(self))) else: - logger.info('building vocab with min_freq={}, max_size={}'.format( - self.min_freq, self.max_size)) + logger.info("building vocab with min_freq={}, max_size={}".format(self.min_freq, self.max_size)) self.idx2sym = [] self.sym2idx = OrderedDict() @@ -167,23 +183,22 @@ class TransfoXLTokenizer(PreTrainedTokenizer): self.add_special(sym) for sym, cnt in self.counter.most_common(self.max_size): - if cnt < self.min_freq: break + if cnt < self.min_freq: + break self.add_symbol(sym) - logger.info('final vocab size {} from {} unique tokens'.format( - len(self), len(self.counter))) + logger.info("final vocab size {} from {} unique tokens".format(len(self), len(self.counter))) - def encode_file(self, path, ordered=False, verbose=False, add_eos=True, - add_double_eos=False): - if verbose: logger.info('encoding file {} ...'.format(path)) + def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False): + if verbose: + logger.info("encoding file {} ...".format(path)) assert os.path.exists(path) encoded = [] - with open(path, 'r', encoding='utf-8') as f: + with open(path, "r", encoding="utf-8") as f: for idx, line in enumerate(f): if verbose and idx > 0 and idx % 500000 == 0: - logger.info(' line {}'.format(idx)) - symbols = self.tokenize(line, add_eos=add_eos, - add_double_eos=add_double_eos) + logger.info(" line {}".format(idx)) + symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos) encoded.append(self.convert_to_tensor(symbols)) if ordered: @@ -192,11 +207,12 @@ class TransfoXLTokenizer(PreTrainedTokenizer): return encoded def encode_sents(self, sents, ordered=False, verbose=False): - if verbose: logger.info('encoding {} sents ...'.format(len(sents))) + if verbose: + logger.info("encoding {} sents ...".format(len(sents))) encoded = [] for idx, symbols in 
enumerate(sents): if verbose and idx > 0 and idx % 500000 == 0: - logger.info(' line {}'.format(idx)) + logger.info(" line {}".format(idx)) encoded.append(self.convert_to_tensor(symbols)) if ordered: @@ -208,7 +224,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer): if sym not in self.sym2idx: self.idx2sym.append(sym) self.sym2idx[sym] = len(self.idx2sym) - 1 - setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym]) + setattr(self, "{}_idx".format(sym.strip("<>")), self.sym2idx[sym]) def add_symbol(self, sym): if sym not in self.sym2idx: @@ -217,7 +233,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer): def _convert_id_to_token(self, idx): """Converts an id in a token (BPE) using the vocab.""" - assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx) + assert 0 <= idx < len(self), "Index {} out of vocabulary range".format(idx) return self.idx2sym[idx] def _convert_token_to_id(self, sym): @@ -227,19 +243,19 @@ class TransfoXLTokenizer(PreTrainedTokenizer): else: # logger.info('encounter unk {}'.format(sym)) # assert '' not in sym - if hasattr(self, 'unk_idx'): + if hasattr(self, "unk_idx"): return self.sym2idx.get(sym, self.unk_idx) # Backward compatibility with pre-trained models - elif '' in self.sym2idx: - return self.sym2idx[''] - elif '' in self.sym2idx: - return self.sym2idx[''] + elif "" in self.sym2idx: + return self.sym2idx[""] + elif "" in self.sym2idx: + return self.sym2idx[""] else: - raise ValueError('Token not in vocabulary and no token in vocabulary for replacement') + raise ValueError("Token not in vocabulary and no token in vocabulary for replacement") def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ - out_string = ' '.join(tokens).strip() + out_string = " ".join(tokens).strip() return out_string def convert_to_tensor(self, symbols): @@ -256,21 +272,21 @@ class TransfoXLTokenizer(PreTrainedTokenizer): line = line.lower() # empty delimiter '' will evaluate False - if self.delimiter == '': + if self.delimiter == "": symbols = line else: symbols = line.split(self.delimiter) - if add_double_eos: # lm1b - return [''] + symbols + [''] + if add_double_eos: # lm1b + return [""] + symbols + [""] elif add_eos: - return symbols + [''] + return symbols + [""] else: return symbols class LMOrderedIterator(object): - def __init__(self, data, bsz, bptt, device='cpu', ext_len=None): + def __init__(self, data, bsz, bptt, device="cpu", ext_len=None): """ data -- LongTensor -- the LongTensor is strictly ordered """ @@ -293,14 +309,15 @@ class LMOrderedIterator(object): self.n_batch = (self.n_step + self.bptt - 1) // self.bptt def get_batch(self, i, bptt=None): - if bptt is None: bptt = self.bptt + if bptt is None: + bptt = self.bptt seq_len = min(bptt, self.data.size(0) - 1 - i) end_idx = i + seq_len beg_idx = max(0, i - self.ext_len) data = self.data[beg_idx:end_idx] - target = self.data[i+1:i+1+seq_len] + target = self.data[i + 1 : i + 1 + seq_len] data_out = data.transpose(0, 1).contiguous().to(self.device) target_out = target.transpose(0, 1).contiguous().to(self.device) @@ -315,7 +332,7 @@ class LMOrderedIterator(object): max_len = self.bptt + max_deviation * std i = start while True: - bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2. 
+ bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0 bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) data, target, seq_len = self.get_batch(i, bptt) i += seq_len @@ -328,7 +345,7 @@ class LMOrderedIterator(object): class LMShuffledIterator(object): - def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False): + def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False): """ data -- list[LongTensor] -- there is no order among the LongTensors """ @@ -343,8 +360,7 @@ class LMShuffledIterator(object): def get_sent_stream(self): # index iterator - epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \ - else np.array(range(len(self.data))) + epoch_indices = np.random.permutation(len(self.data)) if self.shuffle else np.array(range(len(self.data))) # sentence iterator for idx in epoch_indices: @@ -376,10 +392,8 @@ class LMShuffledIterator(object): # number of new tokens to fill in n_new = min(len(streams[i]) - 1, self.bptt - n_filled) # first n_retain tokens are retained from last batch - data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \ - streams[i][:n_new] - target[n_filled:n_filled+n_new, i] = \ - streams[i][1:n_new+1] + data[n_retain + n_filled : n_retain + n_filled + n_new, i] = streams[i][:n_new] + target[n_filled : n_filled + n_new, i] = streams[i][1 : n_new + 1] streams[i] = streams[i][n_new:] n_filled += n_new except StopIteration: @@ -408,8 +422,7 @@ class LMShuffledIterator(object): class LMMultiFileIterator(LMShuffledIterator): - def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None, - shuffle=False): + def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False): self.paths = paths self.vocab = vocab @@ -460,15 +473,16 @@ class TransfoXLCorpus(object): "We assumed '{}' was a path or url but couldn't find files {} " "at this path or url.".format( pretrained_model_name_or_path, - ', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()), + ", ".join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, - corpus_file)) + corpus_file, + ) + ) return None if resolved_corpus_file == corpus_file: logger.info("loading corpus file {}".format(corpus_file)) else: - logger.info("loading corpus file {} from cache at {}".format( - corpus_file, resolved_corpus_file)) + logger.info("loading corpus file {} from cache at {}".format(corpus_file, resolved_corpus_file)) # Instantiate tokenizer. 
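# Editor's sketch (not part of the patch): the contiguous-batch slicing that
# LMOrderedIterator.get_batch performs above, shown on a toy tensor. It assumes only
# that the data is a 1-D LongTensor of ordered token ids; names and sizes below are
# illustrative, and the ext_len window is omitted.
import torch

data = torch.arange(20)          # pretend token ids, already ordered
bsz, bptt = 2, 4                 # batch size and backprop-through-time length
n_step = data.size(0) // bsz     # 10 tokens per stream
data = data[: n_step * bsz].view(bsz, -1).t().contiguous()  # shape (n_step, bsz)

i = 0
seq_len = min(bptt, data.size(0) - 1 - i)
chunk = data[i : i + seq_len]            # inputs for this step
target = data[i + 1 : i + 1 + seq_len]   # next-token targets, shifted by one
print(chunk.t())    # (bsz, seq_len)
print(target.t())   # same shape, offset by one position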
corpus = cls(*inputs, **kwargs) @@ -494,83 +508,78 @@ class TransfoXLCorpus(object): def build_corpus(self, path, dataset): self.dataset = dataset - if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']: - self.vocab.count_file(os.path.join(path, 'train.txt')) - self.vocab.count_file(os.path.join(path, 'valid.txt')) - self.vocab.count_file(os.path.join(path, 'test.txt')) - elif self.dataset == 'wt103': - self.vocab.count_file(os.path.join(path, 'train.txt')) - elif self.dataset == 'lm1b': + if self.dataset in ["ptb", "wt2", "enwik8", "text8"]: + self.vocab.count_file(os.path.join(path, "train.txt")) + self.vocab.count_file(os.path.join(path, "valid.txt")) + self.vocab.count_file(os.path.join(path, "test.txt")) + elif self.dataset == "wt103": + self.vocab.count_file(os.path.join(path, "train.txt")) + elif self.dataset == "lm1b": train_path_pattern = os.path.join( - path, '1-billion-word-language-modeling-benchmark-r13output', - 'training-monolingual.tokenized.shuffled', 'news.en-*') + path, + "1-billion-word-language-modeling-benchmark-r13output", + "training-monolingual.tokenized.shuffled", + "news.en-*", + ) train_paths = glob.glob(train_path_pattern) # the vocab will load from file when build_vocab() is called self.vocab.build_vocab() - if self.dataset in ['ptb', 'wt2', 'wt103']: - self.train = self.vocab.encode_file( - os.path.join(path, 'train.txt'), ordered=True) - self.valid = self.vocab.encode_file( - os.path.join(path, 'valid.txt'), ordered=True) - self.test = self.vocab.encode_file( - os.path.join(path, 'test.txt'), ordered=True) - elif self.dataset in ['enwik8', 'text8']: - self.train = self.vocab.encode_file( - os.path.join(path, 'train.txt'), ordered=True, add_eos=False) - self.valid = self.vocab.encode_file( - os.path.join(path, 'valid.txt'), ordered=True, add_eos=False) - self.test = self.vocab.encode_file( - os.path.join(path, 'test.txt'), ordered=True, add_eos=False) - elif self.dataset == 'lm1b': + if self.dataset in ["ptb", "wt2", "wt103"]: + self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True) + self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True) + self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True) + elif self.dataset in ["enwik8", "text8"]: + self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False) + self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False) + self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False) + elif self.dataset == "lm1b": self.train = train_paths - self.valid = self.vocab.encode_file( - os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True) - self.test = self.vocab.encode_file( - os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True) + self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=False, add_double_eos=True) + self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=False, add_double_eos=True) def get_iterator(self, split, *args, **kwargs): - if split == 'train': - if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + if split == "train": + if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]: data_iter = LMOrderedIterator(self.train, *args, **kwargs) - elif self.dataset == 'lm1b': - kwargs['shuffle'] = True + elif self.dataset == "lm1b": + kwargs["shuffle"] = True data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs) - elif 
split in ['valid', 'test']: - data = self.valid if split == 'valid' else self.test - if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + elif split in ["valid", "test"]: + data = self.valid if split == "valid" else self.test + if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]: data_iter = LMOrderedIterator(data, *args, **kwargs) - elif self.dataset == 'lm1b': + elif self.dataset == "lm1b": data_iter = LMShuffledIterator(data, *args, **kwargs) return data_iter def get_lm_corpus(datadir, dataset): - fn = os.path.join(datadir, 'cache.pt') - fn_pickle = os.path.join(datadir, 'cache.pkl') + fn = os.path.join(datadir, "cache.pt") + fn_pickle = os.path.join(datadir, "cache.pkl") if os.path.exists(fn): - logger.info('Loading cached dataset...') + logger.info("Loading cached dataset...") corpus = torch.load(fn_pickle) elif os.path.exists(fn): - logger.info('Loading cached dataset from pickle...') + logger.info("Loading cached dataset from pickle...") with open(fn, "rb") as fp: corpus = pickle.load(fp) else: - logger.info('Producing dataset {}...'.format(dataset)) + logger.info("Producing dataset {}...".format(dataset)) kwargs = {} - if dataset in ['wt103', 'wt2']: - kwargs['special'] = [''] - kwargs['lower_case'] = False - elif dataset == 'ptb': - kwargs['special'] = [''] - kwargs['lower_case'] = True - elif dataset == 'lm1b': - kwargs['special'] = [] - kwargs['lower_case'] = False - kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt') - elif dataset in ['enwik8', 'text8']: + if dataset in ["wt103", "wt2"]: + kwargs["special"] = [""] + kwargs["lower_case"] = False + elif dataset == "ptb": + kwargs["special"] = [""] + kwargs["lower_case"] = True + elif dataset == "lm1b": + kwargs["special"] = [] + kwargs["lower_case"] = False + kwargs["vocab_file"] = os.path.join(datadir, "1b_word_vocab.txt") + elif dataset in ["enwik8", "text8"]: pass corpus = TransfoXLCorpus(datadir, dataset, **kwargs) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 33a59643f..f848785ee 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for OpenAI GPT.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging import os @@ -34,9 +33,10 @@ if is_torch_available(): logger = logging.getLogger(__name__) -SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json' -ADDED_TOKENS_FILE = 'added_tokens.json' -TOKENIZER_CONFIG_FILE = 'tokenizer_config.json' +SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" +ADDED_TOKENS_FILE = "added_tokens.json" +TOKENIZER_CONFIG_FILE = "tokenizer_config.json" + class PreTrainedTokenizer(object): """ Base class for all tokenizers. @@ -69,14 +69,22 @@ class PreTrainedTokenizer(object): - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. 
Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` """ + vocab_files_names = {} pretrained_vocab_files_map = {} pretrained_init_configuration = {} max_model_input_sizes = {} - SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token", - "pad_token", "cls_token", "mask_token", - "additional_special_tokens"] + SPECIAL_TOKENS_ATTRIBUTES = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] padding_side = "right" @@ -227,8 +235,8 @@ class PreTrainedTokenizer(object): self.max_len = max_len if max_len is not None else int(1e12) # Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed. - self.padding_side = kwargs.pop('padding_side', self.padding_side) - + self.padding_side = kwargs.pop("padding_side", self.padding_side) + # Added tokens self.added_tokens_encoder = {} self.unique_added_tokens_encoder = set() @@ -240,13 +248,14 @@ class PreTrainedTokenizer(object): for key, value in kwargs.items(): if key in self.SPECIAL_TOKENS_ATTRIBUTES: - if key == 'additional_special_tokens': - assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value) + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all( + isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value + ) else: assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) setattr(self, key, value) - @classmethod def from_pretrained(cls, *inputs, **kwargs): r""" @@ -302,13 +311,12 @@ class PreTrainedTokenizer(object): """ return cls._from_pretrained(*inputs, **kwargs) - @classmethod def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): - cache_dir = kwargs.pop('cache_dir', None) - force_download = kwargs.pop('force_download', False) - resume_download = kwargs.pop('resume_download', False) - proxies = kwargs.pop('proxies', None) + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) s3_models = list(cls.max_model_input_sizes.keys()) vocab_files = {} @@ -317,15 +325,19 @@ class PreTrainedTokenizer(object): # Get the vocabulary from AWS S3 bucket for file_id, map_list in cls.pretrained_vocab_files_map.items(): vocab_files[file_id] = map_list[pretrained_model_name_or_path] - if cls.pretrained_init_configuration and pretrained_model_name_or_path in cls.pretrained_init_configuration: + if ( + cls.pretrained_init_configuration + and pretrained_model_name_or_path in cls.pretrained_init_configuration + ): init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path] else: # Get the vocabulary from local files logger.info( "Model name '{}' not found in model shortcut name list ({}). 
" "Assuming '{}' is a path or url to a directory containing tokenizer files.".format( - pretrained_model_name_or_path, ', '.join(s3_models), - pretrained_model_name_or_path)) + pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path + ) + ) # Look for the tokenizer main vocabulary files for file_id, file_name in cls.vocab_files_names.items(): @@ -340,14 +352,15 @@ class PreTrainedTokenizer(object): full_file_name = pretrained_model_name_or_path else: full_file_name = hf_bucket_url(pretrained_model_name_or_path, postfix=file_name) - + vocab_files[file_id] = full_file_name # Look for the additional tokens files - additional_files_names = {'added_tokens_file': ADDED_TOKENS_FILE, - 'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE, - 'tokenizer_config_file': TOKENIZER_CONFIG_FILE, - } + additional_files_names = { + "added_tokens_file": ADDED_TOKENS_FILE, + "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, + "tokenizer_config_file": TOKENIZER_CONFIG_FILE, + } # If a path to a file was provided, get the parent directory saved_directory = pretrained_model_name_or_path @@ -366,9 +379,12 @@ class PreTrainedTokenizer(object): "Model name '{}' was not found in tokenizers model name list ({}). " "We assumed '{}' was a path or url to a directory containing vocabulary files " "named {} but couldn't find such vocabulary files at this path or url.".format( - pretrained_model_name_or_path, ', '.join(s3_models), pretrained_model_name_or_path, - list(cls.vocab_files_names.values()))) + ", ".join(s3_models), + pretrained_model_name_or_path, + list(cls.vocab_files_names.values()), + ) + ) # Get files from url, cache, or disk depending on the case try: @@ -377,17 +393,27 @@ class PreTrainedTokenizer(object): if file_path is None: resolved_vocab_files[file_id] = None else: - resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download) + resolved_vocab_files[file_id] = cached_path( + file_path, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + ) except EnvironmentError: if pretrained_model_name_or_path in s3_models: msg = "Couldn't reach server at '{}' to download vocabulary files." else: - msg = "Model name '{}' was not found in tokenizers model name list ({}). " \ - "We assumed '{}' was a path or url to a directory containing vocabulary files " \ + msg = ( + "Model name '{}' was not found in tokenizers model name list ({}). " + "We assumed '{}' was a path or url to a directory containing vocabulary files " "named {}, but couldn't find such vocabulary files at this path or url.".format( - pretrained_model_name_or_path, ', '.join(s3_models), pretrained_model_name_or_path, - list(cls.vocab_files_names.values())) + ", ".join(s3_models), + pretrained_model_name_or_path, + list(cls.vocab_files_names.values()), + ) + ) raise EnvironmentError(msg) @@ -395,16 +421,15 @@ class PreTrainedTokenizer(object): if file_path == resolved_vocab_files[file_id]: logger.info("loading file {}".format(file_path)) else: - logger.info("loading file {} from cache at {}".format( - file_path, resolved_vocab_files[file_id])) + logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id])) # Prepare tokenizer initialization kwargs # Did we saved some inputs and kwargs to reload ? 
- tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None) + tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) if tokenizer_config_file is not None: with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: init_kwargs = json.load(tokenizer_config_handle) - saved_init_inputs = init_kwargs.pop('init_inputs', ()) + saved_init_inputs = init_kwargs.pop("init_inputs", ()) if not init_inputs: init_inputs = saved_init_inputs else: @@ -419,11 +444,11 @@ class PreTrainedTokenizer(object): # wont index sequences longer than the number of positional embeddings max_len = cls.max_model_input_sizes[pretrained_model_name_or_path] if max_len is not None and isinstance(max_len, (int, float)): - init_kwargs['max_len'] = min(init_kwargs.get('max_len', int(1e12)), max_len) + init_kwargs["max_len"] = min(init_kwargs.get("max_len", int(1e12)), max_len) # Merge resolved_vocab_files arguments in init_kwargs. - added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None) - special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None) + added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) + special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) for args_name, file_path in resolved_vocab_files.items(): if args_name not in init_kwargs: init_kwargs[args_name] = file_path @@ -438,8 +463,10 @@ class PreTrainedTokenizer(object): try: tokenizer = cls(*init_inputs, **init_kwargs) except OSError: - OSError("Unable to load vocabulary from file. " - "Please check that the provided vocabulary is accessible and not corrupted.") + OSError( + "Unable to load vocabulary from file. " + "Please check that the provided vocabulary is accessible and not corrupted." 
+ ) # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` tokenizer.init_inputs = init_inputs @@ -449,13 +476,12 @@ class PreTrainedTokenizer(object): if added_tokens_file is not None: with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: added_tok_encoder = json.load(added_tokens_handle) - added_tok_decoder = {v:k for k, v in added_tok_encoder.items()} + added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} tokenizer.added_tokens_encoder.update(added_tok_encoder) tokenizer.added_tokens_decoder.update(added_tok_decoder) return tokenizer - def save_pretrained(self, save_directory): """ Save the tokenizer vocabulary files together with: - added tokens, @@ -476,28 +502,27 @@ class PreTrainedTokenizer(object): tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) tokenizer_config = copy.deepcopy(self.init_kwargs) - tokenizer_config['init_inputs'] = copy.deepcopy(self.init_inputs) + tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) for file_id in self.vocab_files_names.keys(): tokenizer_config.pop(file_id, None) - with open(tokenizer_config_file, 'w', encoding='utf-8') as f: + with open(tokenizer_config_file, "w", encoding="utf-8") as f: f.write(json.dumps(tokenizer_config, ensure_ascii=False)) - with open(special_tokens_map_file, 'w', encoding='utf-8') as f: + with open(special_tokens_map_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.special_tokens_map, ensure_ascii=False)) - with open(added_tokens_file, 'w', encoding='utf-8') as f: + with open(added_tokens_file, "w", encoding="utf-8") as f: if self.added_tokens_encoder: out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False) else: - out_str = u"{}" + out_str = "{}" f.write(out_str) vocab_files = self.save_vocabulary(save_directory) return vocab_files + (special_tokens_map_file, added_tokens_file) - def save_vocabulary(self, save_directory): """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens and special token mappings. @@ -506,17 +531,14 @@ class PreTrainedTokenizer(object): """ raise NotImplementedError - def vocab_size(self): """ Size of the base vocabulary (without the added tokens) """ raise NotImplementedError - def __len__(self): """ Size of the full vocabulary with the added tokens """ return self.vocab_size + len(self.added_tokens_encoder) - def add_tokens(self, new_tokens): """ Add a list of new tokens to the tokenizer class. 
If the new tokens are not in the @@ -544,16 +566,18 @@ class PreTrainedTokenizer(object): to_add_tokens = [] for token in new_tokens: assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode)) - if self.init_kwargs.get('do_lower_case', False) and token not in self.all_special_tokens: + if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens: token = token.lower() - if token != self.unk_token and \ - self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \ - token not in to_add_tokens: + if ( + token != self.unk_token + and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) + and token not in to_add_tokens + ): to_add_tokens.append(token) logger.info("Adding %s to the vocabulary", token) added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens)) - added_tok_decoder = {v:k for k, v in added_tok_encoder.items()} + added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} self.added_tokens_encoder.update(added_tok_encoder) self.unique_added_tokens_encoder = set(self.added_tokens_encoder.keys()).union(set(self.all_special_tokens)) self.added_tokens_decoder.update(added_tok_decoder) @@ -622,8 +646,10 @@ class PreTrainedTokenizer(object): added_tokens = 0 for key, value in special_tokens_dict.items(): assert key in self.SPECIAL_TOKENS_ATTRIBUTES - if key == 'additional_special_tokens': - assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value) + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all( + isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value + ) added_tokens += self.add_tokens(value) else: assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) @@ -633,7 +659,6 @@ class PreTrainedTokenizer(object): return added_tokens - def tokenize(self, text, **kwargs): """ Converts a string in a sequence of tokens (string), using the tokenizer. 
Split in words for word-based vocabulary or sub-words for sub-word-based @@ -649,14 +674,10 @@ class PreTrainedTokenizer(object): def lowercase_text(t): # convert non-special tokens to lowercase escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens] - pattern = r'(' + r'|'.join(escaped_special_toks) + r')|' + \ - r'(.+?)' - return re.sub( - pattern, - lambda m: m.groups()[0] or m.groups()[1].lower(), - t) - - if self.init_kwargs.get('do_lower_case', False): + pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" + return re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), t) + + if self.init_kwargs.get("do_lower_case", False): text = lowercase_text(text) def split_on_token(tok, text): @@ -694,9 +715,14 @@ class PreTrainedTokenizer(object): tokenized_text += [sub_text] text_list = tokenized_text - return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) \ - if token not in self.unique_added_tokens_encoder - else [token] for token in tokenized_text))) + return list( + itertools.chain.from_iterable( + ( + self._tokenize(token, **kwargs) if token not in self.unique_added_tokens_encoder else [token] + for token in tokenized_text + ) + ) + ) added_tokens = self.unique_added_tokens_encoder tokenized_text = split_on_tokens(added_tokens, text) @@ -737,16 +763,18 @@ class PreTrainedTokenizer(object): def _convert_token_to_id(self, token): raise NotImplementedError - def encode(self, - text, - text_pair=None, - add_special_tokens=True, - max_length=None, - stride=0, - truncation_strategy='longest_first', - pad_to_max_length=False, - return_tensors=None, - **kwargs): + def encode( + self, + text, + text_pair=None, + add_special_tokens=True, + max_length=None, + stride=0, + truncation_strategy="longest_first", + pad_to_max_length=False, + return_tensors=None, + **kwargs + ): """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. @@ -781,32 +809,36 @@ class PreTrainedTokenizer(object): or PyTorch torch.Tensor instead of a list of python integers. 
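# Editor's sketch (not part of the patch): the "lowercase everything except the special
# tokens" regex used by tokenize() above, shown standalone. The token strings here are
# illustrative, not taken from any particular tokenizer.
import re

all_special_tokens = ["[CLS]", "[SEP]"]
escaped_special_toks = [re.escape(t) for t in all_special_tokens]
pattern = r"(" + r"|".join(escaped_special_toks) + r")|(.+?)"
text = "Hello [SEP] World"
# group 1 matches a special token and is kept as-is; group 2 matches any other character
print(re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text))
# -> "hello [SEP] world"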
**kwargs: passed to the `self.tokenize()` method """ - encoded_inputs = self.encode_plus(text, - text_pair=text_pair, - max_length=max_length, - add_special_tokens=add_special_tokens, - stride=stride, - truncation_strategy=truncation_strategy, - pad_to_max_length=pad_to_max_length, - return_tensors=return_tensors, - **kwargs) + encoded_inputs = self.encode_plus( + text, + text_pair=text_pair, + max_length=max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + pad_to_max_length=pad_to_max_length, + return_tensors=return_tensors, + **kwargs + ) return encoded_inputs["input_ids"] - def encode_plus(self, - text, - text_pair=None, - add_special_tokens=True, - max_length=None, - stride=0, - truncation_strategy='longest_first', - pad_to_max_length=False, - return_tensors=None, - return_token_type_ids=True, - return_attention_mask=True, - return_overflowing_tokens=False, - return_special_tokens_mask=False, - **kwargs): + def encode_plus( + self, + text, + text_pair=None, + add_special_tokens=True, + max_length=None, + stride=0, + truncation_strategy="longest_first", + pad_to_max_length=False, + return_tensors=None, + return_token_type_ids=True, + return_attention_mask=True, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + **kwargs + ): """ Returns a dictionary containing the encoded sequence or sequence pair and additional informations: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. @@ -874,34 +906,40 @@ class PreTrainedTokenizer(object): elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): return text else: - raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.") + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." 
+ ) first_ids = get_input_ids(text) second_ids = get_input_ids(text_pair) if text_pair is not None else None - return self.prepare_for_model(first_ids, - pair_ids=second_ids, - max_length=max_length, - pad_to_max_length=pad_to_max_length, - add_special_tokens=add_special_tokens, - stride=stride, - truncation_strategy=truncation_strategy, - return_tensors=return_tensors, - return_attention_mask=return_attention_mask, - return_token_type_ids=return_token_type_ids, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask) - - def batch_encode_plus(self, - batch_text_or_text_pairs=None, - add_special_tokens=False, - max_length=None, - stride=0, - truncation_strategy='longest_first', - return_tensors=None, - return_input_lengths=False, - return_attention_masks=False, - **kwargs): + return self.prepare_for_model( + first_ids, + pair_ids=second_ids, + max_length=max_length, + pad_to_max_length=pad_to_max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + return_tensors=return_tensors, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + ) + + def batch_encode_plus( + self, + batch_text_or_text_pairs=None, + add_special_tokens=False, + max_length=None, + stride=0, + truncation_strategy="longest_first", + return_tensors=None, + return_input_lengths=False, + return_attention_masks=False, + **kwargs + ): """ Returns a dictionary containing the encoded sequence or sequence pair and additional information: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. 
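# Editor's sketch (not part of the patch): how the encode()/encode_plus() pair above is
# typically used. It assumes a concrete subclass and the "bert-base-uncased" shortcut are
# available locally or downloadable; the dictionary keys shown match the defaults of
# encode_plus() in this file (return_token_type_ids and return_attention_mask are True).
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
enc = tokenizer.encode_plus(
    "Hello world", "How are you?",
    max_length=16, pad_to_max_length=True, add_special_tokens=True,
)
# encode() returns just the "input_ids" entry of the same dictionary
assert tokenizer.encode(
    "Hello world", "How are you?", max_length=16, pad_to_max_length=True
) == enc["input_ids"]
print(sorted(enc.keys()))   # ['attention_mask', 'input_ids', 'token_type_ids']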
@@ -933,12 +971,19 @@ class PreTrainedTokenizer(object): ids, pair_ids = ids_or_pair_ids else: ids, pair_ids = ids_or_pair_ids, None - outputs = self.encode_plus(ids, pair_ids, add_special_tokens=add_special_tokens, max_length=max_length, - stride=stride, truncation_strategy=truncation_strategy, return_tensors=None) + outputs = self.encode_plus( + ids, + pair_ids, + add_special_tokens=add_special_tokens, + max_length=max_length, + stride=stride, + truncation_strategy=truncation_strategy, + return_tensors=None, + ) # Append the non-padded length to the output if return_input_lengths: - outputs['input_len'] = len(outputs['input_ids']) + outputs["input_len"] = len(outputs["input_ids"]) for key, value in outputs.items(): if key not in batch_outputs: @@ -946,11 +991,11 @@ class PreTrainedTokenizer(object): batch_outputs[key].append(value) # Compute longest sequence size - max_seq_len = max(map(len, batch_outputs['input_ids'])) + max_seq_len = max(map(len, batch_outputs["input_ids"])) if return_attention_masks: # Allow the model to not give any special attention to padded input - batch_outputs['attention_mask'] = [[0] * len(v) for v in batch_outputs['input_ids']] + batch_outputs["attention_mask"] = [[0] * len(v) for v in batch_outputs["input_ids"]] if return_tensors is not None: @@ -958,34 +1003,48 @@ class PreTrainedTokenizer(object): for key, value in batch_outputs.items(): padded_value = value - if key != 'input_len': + if key != "input_len": # Padding handle - padded_value = [v + [self.pad_token_id if key == 'input_ids' else 1] * (max_seq_len - len(v)) for v in padded_value] + padded_value = [ + v + [self.pad_token_id if key == "input_ids" else 1] * (max_seq_len - len(v)) + for v in padded_value + ] - if return_tensors == 'tf' and is_tf_available(): + if return_tensors == "tf" and is_tf_available(): batch_outputs[key] = tf.constant(padded_value) - elif return_tensors == 'pt' and is_torch_available(): + elif return_tensors == "pt" and is_torch_available(): batch_outputs[key] = torch.tensor(padded_value) elif return_tensors is not None: - logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors)) + logger.warning( + "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( + return_tensors + ) + ) # encoder_attention_mask requires 1 for real token, 0 for padding, just invert value if return_attention_masks: if is_tf_available(): - batch_outputs['attention_mask'] = tf.abs(batch_outputs['attention_mask'] - 1) + batch_outputs["attention_mask"] = tf.abs(batch_outputs["attention_mask"] - 1) else: - batch_outputs['attention_mask'] = torch.abs(batch_outputs['attention_mask'] - 1) + batch_outputs["attention_mask"] = torch.abs(batch_outputs["attention_mask"] - 1) return batch_outputs - def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0, - truncation_strategy='longest_first', - pad_to_max_length=False, - return_tensors=None, - return_token_type_ids=True, - return_attention_mask=True, - return_overflowing_tokens=False, - return_special_tokens_mask=False): + def prepare_for_model( + self, + ids, + pair_ids=None, + max_length=None, + add_special_tokens=True, + stride=0, + truncation_strategy="longest_first", + pad_to_max_length=False, + return_tensors=None, + return_token_type_ids=True, + return_attention_mask=True, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + ): """ Prepares a sequence of input id, or a pair of sequences of 
inputs ids so that it can be used by the model. It adds special tokens, truncates @@ -1050,10 +1109,13 @@ class PreTrainedTokenizer(object): # Handle max sequence length total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0) if max_length and total_len > max_length: - ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids, - num_tokens_to_remove=total_len-max_length, - truncation_strategy=truncation_strategy, - stride=stride) + ids, pair_ids, overflowing_tokens = self.truncate_sequences( + ids, + pair_ids=pair_ids, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) if return_overflowing_tokens: encoded_inputs["overflowing_tokens"] = overflowing_tokens encoded_inputs["num_truncated_tokens"] = total_len - max_length @@ -1081,54 +1143,64 @@ class PreTrainedTokenizer(object): encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length] if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len: - logger.warning("Token indices sequence length is longer than the specified maximum sequence length " - "for this model ({} > {}). Running this sequence through the model will result in " - "indexing errors".format(len(ids), self.max_len)) - + logger.warning( + "Token indices sequence length is longer than the specified maximum sequence length " + "for this model ({} > {}). Running this sequence through the model will result in " + "indexing errors".format(len(ids), self.max_len) + ) + needs_to_be_padded = pad_to_max_length and ( - max_length and len(encoded_inputs["input_ids"]) < max_length - or - max_length is None and len(encoded_inputs["input_ids"]) < self.max_len and self.max_len <= 10000 + max_length + and len(encoded_inputs["input_ids"]) < max_length + or max_length is None + and len(encoded_inputs["input_ids"]) < self.max_len + and self.max_len <= 10000 ) if pad_to_max_length and max_length is None and self.max_len > 10000: - logger.warning("Sequence can't be padded as no maximum length is specified and the model maximum length is too high.") + logger.warning( + "Sequence can't be padded as no maximum length is specified and the model maximum length is too high." 
+ ) if needs_to_be_padded: difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"]) - if self.padding_side == 'right': + if self.padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference if return_token_type_ids: - encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) if return_special_tokens_mask: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference - elif self.padding_side == 'left': + elif self.padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) if return_token_type_ids: - encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"] + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] if return_special_tokens_mask: encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] else: raise ValueError("Invalid padding strategy:" + str(self.padding_side)) - + elif return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) # Prepare inputs as tensors if asked - if return_tensors == 'tf' and is_tf_available(): + if return_tensors == "tf" and is_tf_available(): encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]]) encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]]) if "attention_mask" in encoded_inputs: encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]]) - elif return_tensors == 'pt' and is_torch_available(): + elif return_tensors == "pt" and is_torch_available(): encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]]) encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]]) @@ -1137,11 +1209,15 @@ class PreTrainedTokenizer(object): elif return_tensors is not None: logger.warning( "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( - return_tensors)) + return_tensors + ) + ) return encoded_inputs - def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0): + def truncate_sequences( + self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy="longest_first", stride=0 + ): """Truncates a sequence pair in place to the maximum length. 
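# Editor's sketch (not part of the patch): the right/left padding branch of
# prepare_for_model() shown above, reduced to plain lists. The helper name and values are
# illustrative; only the padding arithmetic mirrors the code in the diff.
def pad(input_ids, attention_mask, target_len, pad_id, padding_side="right"):
    difference = target_len - len(input_ids)
    if padding_side == "right":
        return input_ids + [pad_id] * difference, attention_mask + [0] * difference
    elif padding_side == "left":
        return [pad_id] * difference + input_ids, [0] * difference + attention_mask
    raise ValueError("Invalid padding strategy:" + str(padding_side))

ids, mask = pad([101, 2023, 102], [1, 1, 1], target_len=5, pad_id=0)
print(ids)   # [101, 2023, 102, 0, 0]
print(mask)  # [1, 1, 1, 0, 0]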
truncation_strategy: string selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length @@ -1154,7 +1230,7 @@ class PreTrainedTokenizer(object): if num_tokens_to_remove <= 0: return ids, pair_ids, [] - if truncation_strategy == 'longest_first': + if truncation_strategy == "longest_first": overflowing_tokens = [] for _ in range(num_tokens_to_remove): if pair_ids is None or len(ids) > len(pair_ids): @@ -1165,20 +1241,22 @@ class PreTrainedTokenizer(object): window_len = min(len(ids), stride) if window_len > 0: overflowing_tokens = ids[-window_len:] + overflowing_tokens - elif truncation_strategy == 'only_first': + elif truncation_strategy == "only_first": assert len(ids) > num_tokens_to_remove window_len = min(len(ids), stride + num_tokens_to_remove) overflowing_tokens = ids[-window_len:] ids = ids[:-num_tokens_to_remove] - elif truncation_strategy == 'only_second': + elif truncation_strategy == "only_second": assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove window_len = min(len(pair_ids), stride + num_tokens_to_remove) overflowing_tokens = pair_ids[-window_len:] pair_ids = pair_ids[:-num_tokens_to_remove] - elif truncation_strategy == 'do_not_truncate': + elif truncation_strategy == "do_not_truncate": raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.") else: - raise ValueError("Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']") + raise ValueError( + "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" + ) return (ids, pair_ids, overflowing_tokens) def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): @@ -1246,7 +1324,7 @@ class PreTrainedTokenizer(object): The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids)) but we often want to remove sub-word tokenization artifacts at the same time. """ - return ' '.join(self.convert_ids_to_tokens(tokens)) + return " ".join(self.convert_ids_to_tokens(tokens)) def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): """ @@ -1278,7 +1356,7 @@ class PreTrainedTokenizer(object): current_sub_text.append(token) if current_sub_text: sub_texts.append(self.convert_tokens_to_string(current_sub_text)) - text = ' '.join(sub_texts) + text = " ".join(sub_texts) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) @@ -1323,7 +1401,17 @@ class PreTrainedTokenizer(object): def clean_up_tokenization(out_string): """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms. 
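# Editor's sketch (not part of the patch): the 'longest_first' strategy implemented by
# truncate_sequences() above, on plain lists. Names are illustrative, and the stride /
# overflowing-token bookkeeping is omitted; only the pop-from-the-longer-sequence loop is
# reproduced.
def truncate_longest_first(ids, pair_ids, num_tokens_to_remove):
    ids, pair_ids = list(ids), list(pair_ids)
    for _ in range(num_tokens_to_remove):
        if len(ids) > len(pair_ids):
            ids = ids[:-1]          # trim the currently longer sequence
        else:
            pair_ids = pair_ids[:-1]
    return ids, pair_ids

print(truncate_longest_first([1, 2, 3, 4, 5], [6, 7], 3))  # ([1, 2], [6, 7])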
""" - out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' - ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" - ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") + out_string = ( + out_string.replace(" .", ".") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ,", ",") + .replace(" ' ", "'") + .replace(" n't", "n't") + .replace(" 'm", "'m") + .replace(" do not", " don't") + .replace(" 's", "'s") + .replace(" 've", "'ve") + .replace(" 're", "'re") + ) return out_string diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py index 8def80bec..9b96b92f2 100644 --- a/transformers/tokenization_xlm.py +++ b/transformers/tokenization_xlm.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for XLM.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import json import logging @@ -32,386 +31,402 @@ from .tokenization_bert import BasicTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { - 'vocab_file': 'vocab.json', - 'merges_file': 'merges.txt', + "vocab_file": "vocab.json", + "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json", - 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json", - 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json", - 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json", - 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json", - 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json", - 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json", - 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json", - 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json", + "vocab_file": { + "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json", + "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json", + "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json", + "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json", + "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json", + "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json", + "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json", + "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json", + "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", + "xlm-mlm-100-1280": 
"https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json", }, - 'merges_file': - { - 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt", - 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", - 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", - 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt", - 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt", - 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt", - 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", - 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", - 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt", + "merges_file": { + "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt", + "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", + "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", + "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt", + "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt", + "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt", + "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", + "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", + "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt", + "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'xlm-mlm-en-2048': 512, - 'xlm-mlm-ende-1024': 512, - 'xlm-mlm-enfr-1024': 512, - 'xlm-mlm-enro-1024': 512, - 'xlm-mlm-tlm-xnli15-1024': 512, - 'xlm-mlm-xnli15-1024': 512, - 'xlm-clm-enfr-1024': 512, - 'xlm-clm-ende-1024': 512, - 'xlm-mlm-17-1280': 512, - 'xlm-mlm-100-1280': 512, + "xlm-mlm-en-2048": 512, + "xlm-mlm-ende-1024": 512, + "xlm-mlm-enfr-1024": 512, + "xlm-mlm-enro-1024": 512, + "xlm-mlm-tlm-xnli15-1024": 512, + "xlm-mlm-xnli15-1024": 512, + "xlm-clm-enfr-1024": 512, + "xlm-clm-ende-1024": 512, + "xlm-mlm-17-1280": 512, + "xlm-mlm-100-1280": 512, } PRETRAINED_INIT_CONFIGURATION = { - 'xlm-mlm-en-2048': {"do_lowercase_and_remove_accent": True}, - 'xlm-mlm-ende-1024': { "do_lowercase_and_remove_accent": True, - "id2lang": { "0": "de", - "1": "en"}, - "lang2id": { "de": 0, - "en": 1 }}, - 'xlm-mlm-enfr-1024': { "do_lowercase_and_remove_accent": True, - "id2lang": { "0": "en", - "1": "fr"}, - "lang2id": { "en": 0, - "fr": 1 }}, - 'xlm-mlm-enro-1024': { "do_lowercase_and_remove_accent": True, - "id2lang": { "0": "en", - "1": "ro"}, - "lang2id": { "en": 0, - "ro": 1 }}, - 'xlm-mlm-tlm-xnli15-1024': { "do_lowercase_and_remove_accent": True, - "id2lang": { "0": "ar", - "1": "bg", - "2": "de", - "3": 
"el", - "4": "en", - "5": "es", - "6": "fr", - "7": "hi", - "8": "ru", - "9": "sw", - "10": "th", - "11": "tr", - "12": "ur", - "13": "vi", - "14": "zh"}, - "lang2id": { "ar": 0, - "bg": 1, - "de": 2, - "el": 3, - "en": 4, - "es": 5, - "fr": 6, - "hi": 7, - "ru": 8, - "sw": 9, - "th": 10, - "tr": 11, - "ur": 12, - "vi": 13, - "zh": 14 }}, - 'xlm-mlm-xnli15-1024': { "do_lowercase_and_remove_accent": True, - "id2lang": { "0": "ar", - "1": "bg", - "2": "de", - "3": "el", - "4": "en", - "5": "es", - "6": "fr", - "7": "hi", - "8": "ru", - "9": "sw", - "10": "th", - "11": "tr", - "12": "ur", - "13": "vi", - "14": "zh"}, - "lang2id": { "ar": 0, - "bg": 1, - "de": 2, - "el": 3, - "en": 4, - "es": 5, - "fr": 6, - "hi": 7, - "ru": 8, - "sw": 9, - "th": 10, - "tr": 11, - "ur": 12, - "vi": 13, - "zh": 14 }}, - 'xlm-clm-enfr-1024': { "do_lowercase_and_remove_accent": True, - "id2lang": { "0": "en", - "1": "fr"}, - "lang2id": { "en": 0, - "fr": 1 }}, - 'xlm-clm-ende-1024': { "do_lowercase_and_remove_accent": True, - "id2lang": { "0": "de", - "1": "en"}, - "lang2id": { "de": 0, - "en": 1 }}, - 'xlm-mlm-17-1280': {"do_lowercase_and_remove_accent": False, - "id2lang": { - "0": "ar", - "1": "de", - "2": "en", - "3": "es", - "4": "fr", - "5": "hi", - "6": "it", - "7": "ja", - "8": "ko", - "9": "nl", - "10": "pl", - "11": "pt", - "12": "ru", - "13": "sv", - "14": "tr", - "15": "vi", - "16": "zh" - }, - "lang2id": { - "ar": 0, - "de": 1, - "en": 2, - "es": 3, - "fr": 4, - "hi": 5, - "it": 6, - "ja": 7, - "ko": 8, - "nl": 9, - "pl": 10, - "pt": 11, - "ru": 12, - "sv": 13, - "tr": 14, - "vi": 15, - "zh": 16}}, - 'xlm-mlm-100-1280': {"do_lowercase_and_remove_accent": False, - "id2lang": { - "0": "af", - "1": "als", - "2": "am", - "3": "an", - "4": "ang", - "5": "ar", - "6": "arz", - "7": "ast", - "8": "az", - "9": "bar", - "10": "be", - "11": "bg", - "12": "bn", - "13": "br", - "14": "bs", - "15": "ca", - "16": "ceb", - "17": "ckb", - "18": "cs", - "19": "cy", - "20": "da", - "21": "de", - "22": "el", - "23": "en", - "24": "eo", - "25": "es", - "26": "et", - "27": "eu", - "28": "fa", - "29": "fi", - "30": "fr", - "31": "fy", - "32": "ga", - "33": "gan", - "34": "gl", - "35": "gu", - "36": "he", - "37": "hi", - "38": "hr", - "39": "hu", - "40": "hy", - "41": "ia", - "42": "id", - "43": "is", - "44": "it", - "45": "ja", - "46": "jv", - "47": "ka", - "48": "kk", - "49": "kn", - "50": "ko", - "51": "ku", - "52": "la", - "53": "lb", - "54": "lt", - "55": "lv", - "56": "mk", - "57": "ml", - "58": "mn", - "59": "mr", - "60": "ms", - "61": "my", - "62": "nds", - "63": "ne", - "64": "nl", - "65": "nn", - "66": "no", - "67": "oc", - "68": "pl", - "69": "pt", - "70": "ro", - "71": "ru", - "72": "scn", - "73": "sco", - "74": "sh", - "75": "si", - "76": "simple", - "77": "sk", - "78": "sl", - "79": "sq", - "80": "sr", - "81": "sv", - "82": "sw", - "83": "ta", - "84": "te", - "85": "th", - "86": "tl", - "87": "tr", - "88": "tt", - "89": "uk", - "90": "ur", - "91": "uz", - "92": "vi", - "93": "war", - "94": "wuu", - "95": "yi", - "96": "zh", - "97": "zh_classical", - "98": "zh_min_nan", - "99": "zh_yue" - }, - "lang2id": { - "af": 0, - "als": 1, - "am": 2, - "an": 3, - "ang": 4, - "ar": 5, - "arz": 6, - "ast": 7, - "az": 8, - "bar": 9, - "be": 10, - "bg": 11, - "bn": 12, - "br": 13, - "bs": 14, - "ca": 15, - "ceb": 16, - "ckb": 17, - "cs": 18, - "cy": 19, - "da": 20, - "de": 21, - "el": 22, - "en": 23, - "eo": 24, - "es": 25, - "et": 26, - "eu": 27, - "fa": 28, - "fi": 29, - "fr": 30, - "fy": 31, - "ga": 32, - "gan": 33, - 
"gl": 34, - "gu": 35, - "he": 36, - "hi": 37, - "hr": 38, - "hu": 39, - "hy": 40, - "ia": 41, - "id": 42, - "is": 43, - "it": 44, - "ja": 45, - "jv": 46, - "ka": 47, - "kk": 48, - "kn": 49, - "ko": 50, - "ku": 51, - "la": 52, - "lb": 53, - "lt": 54, - "lv": 55, - "mk": 56, - "ml": 57, - "mn": 58, - "mr": 59, - "ms": 60, - "my": 61, - "nds": 62, - "ne": 63, - "nl": 64, - "nn": 65, - "no": 66, - "oc": 67, - "pl": 68, - "pt": 69, - "ro": 70, - "ru": 71, - "scn": 72, - "sco": 73, - "sh": 74, - "si": 75, - "simple": 76, - "sk": 77, - "sl": 78, - "sq": 79, - "sr": 80, - "sv": 81, - "sw": 82, - "ta": 83, - "te": 84, - "th": 85, - "tl": 86, - "tr": 87, - "tt": 88, - "uk": 89, - "ur": 90, - "uz": 91, - "vi": 92, - "war": 93, - "wuu": 94, - "yi": 95, - "zh": 96, - "zh_classical": 97, - "zh_min_nan": 98, - "zh_yue": 99 - }}, + "xlm-mlm-en-2048": {"do_lowercase_and_remove_accent": True}, + "xlm-mlm-ende-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": {"0": "de", "1": "en"}, + "lang2id": {"de": 0, "en": 1}, + }, + "xlm-mlm-enfr-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": {"0": "en", "1": "fr"}, + "lang2id": {"en": 0, "fr": 1}, + }, + "xlm-mlm-enro-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": {"0": "en", "1": "ro"}, + "lang2id": {"en": 0, "ro": 1}, + }, + "xlm-mlm-tlm-xnli15-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": { + "0": "ar", + "1": "bg", + "2": "de", + "3": "el", + "4": "en", + "5": "es", + "6": "fr", + "7": "hi", + "8": "ru", + "9": "sw", + "10": "th", + "11": "tr", + "12": "ur", + "13": "vi", + "14": "zh", + }, + "lang2id": { + "ar": 0, + "bg": 1, + "de": 2, + "el": 3, + "en": 4, + "es": 5, + "fr": 6, + "hi": 7, + "ru": 8, + "sw": 9, + "th": 10, + "tr": 11, + "ur": 12, + "vi": 13, + "zh": 14, + }, + }, + "xlm-mlm-xnli15-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": { + "0": "ar", + "1": "bg", + "2": "de", + "3": "el", + "4": "en", + "5": "es", + "6": "fr", + "7": "hi", + "8": "ru", + "9": "sw", + "10": "th", + "11": "tr", + "12": "ur", + "13": "vi", + "14": "zh", + }, + "lang2id": { + "ar": 0, + "bg": 1, + "de": 2, + "el": 3, + "en": 4, + "es": 5, + "fr": 6, + "hi": 7, + "ru": 8, + "sw": 9, + "th": 10, + "tr": 11, + "ur": 12, + "vi": 13, + "zh": 14, + }, + }, + "xlm-clm-enfr-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": {"0": "en", "1": "fr"}, + "lang2id": {"en": 0, "fr": 1}, + }, + "xlm-clm-ende-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": {"0": "de", "1": "en"}, + "lang2id": {"de": 0, "en": 1}, + }, + "xlm-mlm-17-1280": { + "do_lowercase_and_remove_accent": False, + "id2lang": { + "0": "ar", + "1": "de", + "2": "en", + "3": "es", + "4": "fr", + "5": "hi", + "6": "it", + "7": "ja", + "8": "ko", + "9": "nl", + "10": "pl", + "11": "pt", + "12": "ru", + "13": "sv", + "14": "tr", + "15": "vi", + "16": "zh", + }, + "lang2id": { + "ar": 0, + "de": 1, + "en": 2, + "es": 3, + "fr": 4, + "hi": 5, + "it": 6, + "ja": 7, + "ko": 8, + "nl": 9, + "pl": 10, + "pt": 11, + "ru": 12, + "sv": 13, + "tr": 14, + "vi": 15, + "zh": 16, + }, + }, + "xlm-mlm-100-1280": { + "do_lowercase_and_remove_accent": False, + "id2lang": { + "0": "af", + "1": "als", + "2": "am", + "3": "an", + "4": "ang", + "5": "ar", + "6": "arz", + "7": "ast", + "8": "az", + "9": "bar", + "10": "be", + "11": "bg", + "12": "bn", + "13": "br", + "14": "bs", + "15": "ca", + "16": "ceb", + "17": "ckb", + "18": "cs", + "19": "cy", + "20": "da", + "21": "de", + "22": "el", + "23": "en", + "24": "eo", + "25": "es", + "26": 
"et", + "27": "eu", + "28": "fa", + "29": "fi", + "30": "fr", + "31": "fy", + "32": "ga", + "33": "gan", + "34": "gl", + "35": "gu", + "36": "he", + "37": "hi", + "38": "hr", + "39": "hu", + "40": "hy", + "41": "ia", + "42": "id", + "43": "is", + "44": "it", + "45": "ja", + "46": "jv", + "47": "ka", + "48": "kk", + "49": "kn", + "50": "ko", + "51": "ku", + "52": "la", + "53": "lb", + "54": "lt", + "55": "lv", + "56": "mk", + "57": "ml", + "58": "mn", + "59": "mr", + "60": "ms", + "61": "my", + "62": "nds", + "63": "ne", + "64": "nl", + "65": "nn", + "66": "no", + "67": "oc", + "68": "pl", + "69": "pt", + "70": "ro", + "71": "ru", + "72": "scn", + "73": "sco", + "74": "sh", + "75": "si", + "76": "simple", + "77": "sk", + "78": "sl", + "79": "sq", + "80": "sr", + "81": "sv", + "82": "sw", + "83": "ta", + "84": "te", + "85": "th", + "86": "tl", + "87": "tr", + "88": "tt", + "89": "uk", + "90": "ur", + "91": "uz", + "92": "vi", + "93": "war", + "94": "wuu", + "95": "yi", + "96": "zh", + "97": "zh_classical", + "98": "zh_min_nan", + "99": "zh_yue", + }, + "lang2id": { + "af": 0, + "als": 1, + "am": 2, + "an": 3, + "ang": 4, + "ar": 5, + "arz": 6, + "ast": 7, + "az": 8, + "bar": 9, + "be": 10, + "bg": 11, + "bn": 12, + "br": 13, + "bs": 14, + "ca": 15, + "ceb": 16, + "ckb": 17, + "cs": 18, + "cy": 19, + "da": 20, + "de": 21, + "el": 22, + "en": 23, + "eo": 24, + "es": 25, + "et": 26, + "eu": 27, + "fa": 28, + "fi": 29, + "fr": 30, + "fy": 31, + "ga": 32, + "gan": 33, + "gl": 34, + "gu": 35, + "he": 36, + "hi": 37, + "hr": 38, + "hu": 39, + "hy": 40, + "ia": 41, + "id": 42, + "is": 43, + "it": 44, + "ja": 45, + "jv": 46, + "ka": 47, + "kk": 48, + "kn": 49, + "ko": 50, + "ku": 51, + "la": 52, + "lb": 53, + "lt": 54, + "lv": 55, + "mk": 56, + "ml": 57, + "mn": 58, + "mr": 59, + "ms": 60, + "my": 61, + "nds": 62, + "ne": 63, + "nl": 64, + "nn": 65, + "no": 66, + "oc": 67, + "pl": 68, + "pt": 69, + "ro": 70, + "ru": 71, + "scn": 72, + "sco": 73, + "sh": 74, + "si": 75, + "simple": 76, + "sk": 77, + "sl": 78, + "sq": 79, + "sr": 80, + "sv": 81, + "sw": 82, + "ta": 83, + "te": 84, + "th": 85, + "tl": 86, + "tr": 87, + "tt": 88, + "uk": 89, + "ur": 90, + "uz": 91, + "vi": 92, + "war": 93, + "wuu": 94, + "yi": 95, + "zh": 96, + "zh_classical": 97, + "zh_min_nan": 98, + "zh_yue": 99, + }, + }, } + def get_pairs(word): """ Return set of symbol pairs in a word. @@ -430,7 +445,7 @@ def lowercase_and_remove_accent(text): Lowercase and strips accents from a piece of text based on https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py """ - text = ' '.join(text) + text = " ".join(text) text = text.lower() text = unicodedata.normalize("NFD", text) output = [] @@ -439,73 +454,73 @@ def lowercase_and_remove_accent(text): if cat == "Mn": continue output.append(char) - return "".join(output).lower().split(' ') + return "".join(output).lower().split(" ") def replace_unicode_punct(text): - ''' + """ Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl - ''' - text = text.replace(',', ',') - text = re.sub(r'。\s*', '. 
', text) - text = text.replace('、', ',') - text = text.replace('”', '"') - text = text.replace('“', '"') - text = text.replace('∶', ':') - text = text.replace(':', ':') - text = text.replace('?', '?') - text = text.replace('《', '"') - text = text.replace('》', '"') - text = text.replace(')', ')') - text = text.replace('!', '!') - text = text.replace('(', '(') - text = text.replace(';', ';') - text = text.replace('1', '"') - text = text.replace('」', '"') - text = text.replace('「', '"') - text = text.replace('0', '0') - text = text.replace('3', '3') - text = text.replace('2', '2') - text = text.replace('5', '5') - text = text.replace('6', '6') - text = text.replace('9', '9') - text = text.replace('7', '7') - text = text.replace('8', '8') - text = text.replace('4', '4') - text = re.sub(r'.\s*', '. ', text) - text = text.replace('~', '~') - text = text.replace('’', '\'') - text = text.replace('…', '...') - text = text.replace('━', '-') - text = text.replace('〈', '<') - text = text.replace('〉', '>') - text = text.replace('【', '[') - text = text.replace('】', ']') - text = text.replace('%', '%') + """ + text = text.replace(",", ",") + text = re.sub(r"。\s*", ". ", text) + text = text.replace("、", ",") + text = text.replace("”", '"') + text = text.replace("“", '"') + text = text.replace("∶", ":") + text = text.replace(":", ":") + text = text.replace("?", "?") + text = text.replace("《", '"') + text = text.replace("》", '"') + text = text.replace(")", ")") + text = text.replace("!", "!") + text = text.replace("(", "(") + text = text.replace(";", ";") + text = text.replace("1", '"') + text = text.replace("」", '"') + text = text.replace("「", '"') + text = text.replace("0", "0") + text = text.replace("3", "3") + text = text.replace("2", "2") + text = text.replace("5", "5") + text = text.replace("6", "6") + text = text.replace("9", "9") + text = text.replace("7", "7") + text = text.replace("8", "8") + text = text.replace("4", "4") + text = re.sub(r".\s*", ". 
", text) + text = text.replace("~", "~") + text = text.replace("’", "'") + text = text.replace("…", "...") + text = text.replace("━", "-") + text = text.replace("〈", "<") + text = text.replace("〉", ">") + text = text.replace("【", "[") + text = text.replace("】", "]") + text = text.replace("%", "%") return text def remove_non_printing_char(text): - ''' + """ Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl - ''' + """ output = [] for char in text: cat = unicodedata.category(char) - if cat.startswith('C'): + if cat.startswith("C"): continue output.append(char) return "".join(output) def romanian_preprocessing(text): - '''Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`''' + """Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`""" # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219") text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b") # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/remove-diacritics.py - text = text.replace("\u0218", "S").replace("\u0219", "s") #s-comma - text = text.replace("\u021a", "T").replace("\u021b", "t") #t-comma + text = text.replace("\u0218", "S").replace("\u0219", "s") # s-comma + text = text.replace("\u021a", "T").replace("\u021b", "t") # t-comma text = text.replace("\u0102", "A").replace("\u0103", "a") text = text.replace("\u00C2", "A").replace("\u00E2", "a") text = text.replace("\u00CE", "I").replace("\u00EE", "i") @@ -531,33 +546,58 @@ class XLMTokenizer(PreTrainedTokenizer): - `do_lowercase_and_remove_accent` controle lower casing and accent (automatically set for pretrained vocabularies) """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, merges_file, unk_token="", bos_token="", - sep_token="", pad_token="", cls_token="", - mask_token="", additional_special_tokens=["", - "", "", "", "", "", - "", "", "", ""], - lang2id=None, id2lang=None, do_lowercase_and_remove_accent=True, - **kwargs): - super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token, - sep_token=sep_token, pad_token=pad_token, - cls_token=cls_token, mask_token=mask_token, - additional_special_tokens=additional_special_tokens, - **kwargs) - + def __init__( + self, + vocab_file, + merges_file, + unk_token="", + bos_token="", + sep_token="", + pad_token="", + cls_token="", + mask_token="", + additional_special_tokens=[ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + lang2id=None, + id2lang=None, + do_lowercase_and_remove_accent=True, + **kwargs + ): + super(XLMTokenizer, self).__init__( + unk_token=unk_token, + bos_token=bos_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens - self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens + self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens # cache of sm.MosesPunctNormalizer instance self.cache_moses_punct_normalizer = dict() # cache of sm.MosesTokenizer instance self.cache_moses_tokenizer = 
dict() - self.lang_with_custom_tokenizer = set(['zh', 'th', 'ja']) + self.lang_with_custom_tokenizer = set(["zh", "th", "ja"]) # True for current supported model (v1.2.0), False for XLM-17 & 100 self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent self.lang2id = lang2id @@ -570,9 +610,9 @@ class XLMTokenizer(PreTrainedTokenizer): with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) - self.decoder = {v:k for k,v in self.encoder.items()} - with open(merges_file, encoding='utf-8') as merges_handle: - merges = merges_handle.read().split('\n')[:-1] + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[:-1] merges = [tuple(merge.split()[:2]) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} @@ -603,9 +643,14 @@ class XLMTokenizer(PreTrainedTokenizer): if self.ja_word_tokenizer is None: try: import Mykytea - self.ja_word_tokenizer = Mykytea.Mykytea('-model %s/local/share/kytea/model.bin' % os.path.expanduser('~')) + + self.ja_word_tokenizer = Mykytea.Mykytea( + "-model %s/local/share/kytea/model.bin" % os.path.expanduser("~") + ) except (AttributeError, ImportError) as e: - logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps") + logger.error( + "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps" + ) logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea") logger.error("2. autoreconf -i") logger.error("3. ./configure --prefix=$HOME/local") @@ -619,16 +664,16 @@ class XLMTokenizer(PreTrainedTokenizer): return len(self.encoder) def bpe(self, token): - word = tuple(token[:-1]) + (token[-1] + '',) + word = tuple(token[:-1]) + (token[-1] + "",) if token in self.cache: return self.cache[token] pairs = get_pairs(word) if not pairs: - return token+'' + return token + "" while True: - bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram @@ -643,8 +688,8 @@ class XLMTokenizer(PreTrainedTokenizer): new_word.extend(word[i:]) break - if word[i] == first and i < len(word)-1 and word[i+1] == second: - new_word.append(first+second) + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) i += 2 else: new_word.append(word[i]) @@ -655,13 +700,13 @@ class XLMTokenizer(PreTrainedTokenizer): break else: pairs = get_pairs(word) - word = ' '.join(word) - if word == '\n ': - word = '\n' + word = " ".join(word) + if word == "\n ": + word = "\n" self.cache[token] = word return word - def _tokenize(self, text, lang='en', bypass_tokenizer=False): + def _tokenize(self, text, lang="en", bypass_tokenizer=False): """ Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific tokenizerself. Otherwise, we use Moses. @@ -697,45 +742,49 @@ class XLMTokenizer(PreTrainedTokenizer): List of tokens. """ if lang and self.lang2id and lang not in self.lang2id: - logger.error("Supplied language code not found in lang2id mapping. 
Please check that your language is supported by the loaded pretrained model.") + logger.error( + "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model." + ) if bypass_tokenizer: text = text.split() elif lang not in self.lang_with_custom_tokenizer: text = self.moses_pipeline(text, lang=lang) # TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step - if lang == 'ro': + if lang == "ro": text = romanian_preprocessing(text) text = self.moses_tokenize(text, lang=lang) - elif lang == 'th': + elif lang == "th": text = self.moses_pipeline(text, lang=lang) try: - if 'pythainlp' not in sys.modules: + if "pythainlp" not in sys.modules: from pythainlp.tokenize import word_tokenize as th_word_tokenize else: - th_word_tokenize = sys.modules['pythainlp'].word_tokenize + th_word_tokenize = sys.modules["pythainlp"].word_tokenize except (AttributeError, ImportError) as e: - logger.error("Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps") + logger.error( + "Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps" + ) logger.error("1. pip install pythainlp") raise e text = th_word_tokenize(text) - elif lang == 'zh': + elif lang == "zh": try: - if 'jieba' not in sys.modules: + if "jieba" not in sys.modules: import jieba else: - jieba = sys.modules['jieba'] + jieba = sys.modules["jieba"] except (AttributeError, ImportError) as e: logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps") logger.error("1. pip install jieba") raise e - text = ' '.join(jieba.cut(text)) + text = " ".join(jieba.cut(text)) text = self.moses_pipeline(text, lang=lang) text = text.split() - elif lang == 'ja': + elif lang == "ja": text = self.moses_pipeline(text, lang=lang) text = self.ja_tokenize(text) else: - raise ValueError('It should not reach here') + raise ValueError("It should not reach here") if self.do_lowercase_and_remove_accent and not bypass_tokenizer: text = lowercase_and_remove_accent(text) @@ -743,7 +792,7 @@ class XLMTokenizer(PreTrainedTokenizer): split_tokens = [] for token in text: if token: - split_tokens.extend([t for t in self.bpe(token).split(' ')]) + split_tokens.extend([t for t in self.bpe(token).split(" ")]) return split_tokens @@ -757,7 +806,7 @@ class XLMTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ - out_string = ''.join(tokens).replace('', ' ').strip() + out_string = "".join(tokens).replace("", " ").strip() return out_string def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -792,8 +841,10 @@ class XLMTokenizer(PreTrainedTokenizer): if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." 
+ ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: @@ -820,20 +871,22 @@ class XLMTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) + vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) + merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) - with open(vocab_file, 'w', encoding='utf-8') as f: + with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: - logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!".format(merge_file)) + logger.warning( + "Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file) + ) index = token_index - writer.write(' '.join(bpe_tokens) + u'\n') + writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py index adbc8cd6c..30814c3a1 100644 --- a/transformers/tokenization_xlm_roberta.py +++ b/transformers/tokenization_xlm_roberta.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License """ Tokenization classes for XLM-RoBERTa model.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging import os @@ -26,29 +25,29 @@ from .tokenization_xlnet import SPIECE_UNDERLINE logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'} +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model", - 'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model", - 'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-sentencepiece.bpe.model", - 'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-sentencepiece.bpe.model", - 'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-sentencepiece.bpe.model", - 'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-sentencepiece.bpe.model", + "vocab_file": { + "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model", + "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model", + "xlm-roberta-large-finetuned-conll02-dutch": 
"https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-sentencepiece.bpe.model", + "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-sentencepiece.bpe.model", + "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-sentencepiece.bpe.model", + "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-sentencepiece.bpe.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'xlm-roberta-base': 512, - 'xlm-roberta-large': 512, - 'xlm-roberta-large-finetuned-conll02-dutch': 512, - 'xlm-roberta-large-finetuned-conll02-spanish': 512, - 'xlm-roberta-large-finetuned-conll03-english': 512, - 'xlm-roberta-large-finetuned-conll03-german': 512, + "xlm-roberta-base": 512, + "xlm-roberta-large": 512, + "xlm-roberta-large-finetuned-conll02-dutch": 512, + "xlm-roberta-large-finetuned-conll02-spanish": 512, + "xlm-roberta-large-finetuned-conll03-english": 512, + "xlm-roberta-large-finetuned-conll03-german": 512, } + class XLMRobertaTokenizer(PreTrainedTokenizer): """ Adapted from RobertaTokenizer and XLNetTokenizer @@ -56,17 +55,33 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): - requires `SentencePiece `_ """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, bos_token="", eos_token="", sep_token="", - cls_token="", unk_token="", pad_token='', mask_token='', - **kwargs): - super(XLMRobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, - sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, - mask_token=mask_token, - **kwargs) + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + **kwargs + ): + super(XLMRobertaTokenizer, self).__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens self.sp_model = spm.SentencePieceProcessor() @@ -85,7 +100,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab self.fairseq_offset = 1 - self.fairseq_tokens_to_ids[''] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) + self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -119,8 +134,10 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." 
+ ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is None: @@ -164,7 +181,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def save_vocabulary(self, save_directory): @@ -174,7 +191,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py index a8369df67..8ea0ccb77 100644 --- a/transformers/tokenization_xlnet.py +++ b/transformers/tokenization_xlnet.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization classes for XLNet model.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging import os @@ -27,51 +26,69 @@ from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'} +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model", - 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model", + "vocab_file": { + "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model", + "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'xlnet-base-cased': None, - 'xlnet-large-cased': None, + "xlnet-base-cased": None, + "xlnet-large-cased": None, } -SPIECE_UNDERLINE = u'▁' +SPIECE_UNDERLINE = "▁" # Segments (not really needed) -SEG_ID_A = 0 -SEG_ID_B = 1 +SEG_ID_A = 0 +SEG_ID_B = 1 SEG_ID_CLS = 2 SEG_ID_SEP = 3 SEG_ID_PAD = 4 + class XLNetTokenizer(PreTrainedTokenizer): """ SentencePiece based tokenizer. 
Peculiarities: - requires `SentencePiece `_ """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES padding_side = "left" - def __init__(self, vocab_file, - do_lower_case=False, remove_space=True, keep_accents=False, - bos_token="", eos_token="", unk_token="", sep_token="", - pad_token="", cls_token="", mask_token="", - additional_special_tokens=["", ""], **kwargs): - super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, - unk_token=unk_token, sep_token=sep_token, - pad_token=pad_token, cls_token=cls_token, - mask_token=mask_token, additional_special_tokens= - additional_special_tokens, **kwargs) + def __init__( + self, + vocab_file, + do_lower_case=False, + remove_space=True, + keep_accents=False, + bos_token="", + eos_token="", + unk_token="", + sep_token="", + pad_token="", + cls_token="", + mask_token="", + additional_special_tokens=["", ""], + **kwargs + ): + super(XLNetTokenizer, self).__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens @@ -80,8 +97,10 @@ class XLNetTokenizer(PreTrainedTokenizer): try: import sentencepiece as spm except ImportError: - logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece") + logger.warning( + "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" + "pip install sentencepiece" + ) self.do_lower_case = do_lower_case self.remove_space = remove_space @@ -105,24 +124,26 @@ class XLNetTokenizer(PreTrainedTokenizer): try: import sentencepiece as spm except ImportError: - logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece") + logger.warning( + "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" + "pip install sentencepiece" + ) self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def preprocess_text(self, inputs): if self.remove_space: - outputs = ' '.join(inputs.strip().split()) + outputs = " ".join(inputs.strip().split()) else: outputs = inputs outputs = outputs.replace("``", '"').replace("''", '"') if six.PY2 and isinstance(outputs, str): - outputs = outputs.decode('utf-8') + outputs = outputs.decode("utf-8") if not self.keep_accents: - outputs = unicodedata.normalize('NFKD', outputs) - outputs = ''.join([c for c in outputs if not unicodedata.combining(c)]) + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) if self.do_lower_case: outputs = outputs.lower() @@ -135,7 +156,7 @@ class XLNetTokenizer(PreTrainedTokenizer): text = self.preprocess_text(text) # note(zhiliny): in some systems, sentencepiece only accepts str for py2 if six.PY2 and isinstance(text, unicode): - text = text.encode('utf-8') + text = text.encode("utf-8") if not sample: pieces = self.sp_model.EncodeAsPieces(text) @@ -143,9 +164,8 @@ class XLNetTokenizer(PreTrainedTokenizer): pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) new_pieces 
= [] for piece in pieces: - if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit(): - cur_pieces = self.sp_model.EncodeAsPieces( - piece[:-1].replace(SPIECE_UNDERLINE, '')) + if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): + cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: if len(cur_pieces[0]) == 1: cur_pieces = cur_pieces[1:] @@ -161,7 +181,7 @@ class XLNetTokenizer(PreTrainedTokenizer): ret_pieces = [] for piece in new_pieces: if isinstance(piece, str): - piece = piece.decode('utf-8') + piece = piece.decode("utf-8") ret_pieces.append(piece) new_pieces = ret_pieces @@ -175,12 +195,12 @@ class XLNetTokenizer(PreTrainedTokenizer): """Converts an index (integer) in a token (string/unicode) using the vocab.""" token = self.sp_model.IdToPiece(index) if six.PY2 and return_unicode and isinstance(token, str): - token = token.decode('utf-8') + token = token.decode("utf-8") return token def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -215,8 +235,10 @@ class XLNetTokenizer(PreTrainedTokenizer): if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: @@ -247,7 +269,7 @@ class XLNetTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/utils/download_glue_data.py b/utils/download_glue_data.py index de8cfa9e7..7262dd720 100644 --- a/utils/download_glue_data.py +++ b/utils/download_glue_data.py @@ -1,4 +1,4 @@ -''' Script for downloading all GLUE data. +""" Script for downloading all GLUE data. Original source: https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e Note: for legal reasons, we are unable to host MRPC. @@ -16,7 +16,7 @@ rm MSRParaphraseCorpus.msi 1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now. 2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray! 
-''' +""" import os import sys @@ -27,20 +27,23 @@ import urllib.request import zipfile TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"] -TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4', - "SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8', - "MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc', - "QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5', - "STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5', - "MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce', - "SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df', - "QNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601', - "RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb', - "WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf', - "diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'} - -MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt' -MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt' +TASK2PATH = { + "CoLA": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4", + "SST": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8", + "MRPC": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc", + "QQP": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5", + "STS": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5", 
+ "MNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce", + "SNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df", + "QNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601", + "RTE": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb", + "WNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf", + "diagnostic": "https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D", +} + +MRPC_TRAIN = "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt" +MRPC_TEST = "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt" + def download_and_extract(task, data_dir): print("Downloading and extracting %s..." % task) @@ -51,6 +54,7 @@ def download_and_extract(task, data_dir): os.remove(data_file) print("\tCompleted!") + def format_mrpc(data_dir, path_to_data): print("Processing MRPC...") mrpc_dir = os.path.join(data_dir, "MRPC") @@ -72,30 +76,32 @@ def format_mrpc(data_dir, path_to_data): dev_ids = [] with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh: for row in ids_fh: - dev_ids.append(row.strip().split('\t')) + dev_ids.append(row.strip().split("\t")) - with open(mrpc_train_file, encoding="utf8") as data_fh, \ - open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding="utf8") as train_fh, \ - open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding="utf8") as dev_fh: + with open(mrpc_train_file, encoding="utf8") as data_fh, open( + os.path.join(mrpc_dir, "train.tsv"), "w", encoding="utf8" + ) as train_fh, open(os.path.join(mrpc_dir, "dev.tsv"), "w", encoding="utf8") as dev_fh: header = data_fh.readline() train_fh.write(header) dev_fh.write(header) for row in data_fh: - label, id1, id2, s1, s2 = row.strip().split('\t') + label, id1, id2, s1, s2 = row.strip().split("\t") if [id1, id2] in dev_ids: dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) else: train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) - with open(mrpc_test_file, encoding="utf8") as data_fh, \ - open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding="utf8") as test_fh: + with open(mrpc_test_file, encoding="utf8") as data_fh, open( + os.path.join(mrpc_dir, "test.tsv"), "w", encoding="utf8" + ) as test_fh: header = data_fh.readline() test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n") for idx, row in enumerate(data_fh): - label, id1, id2, s1, s2 = row.strip().split('\t') + label, id1, id2, s1, s2 = row.strip().split("\t") test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, 
id1, id2, s1, s2)) print("\tCompleted!") + def download_diagnostic(data_dir): print("Downloading and extracting diagnostic...") if not os.path.isdir(os.path.join(data_dir, "diagnostic")): @@ -105,8 +111,9 @@ def download_diagnostic(data_dir): print("\tCompleted!") return + def get_tasks(task_names): - task_names = task_names.split(',') + task_names = task_names.split(",") if "all" in task_names: tasks = TASKS else: @@ -116,13 +123,19 @@ def get_tasks(task_names): tasks.append(task_name) return tasks + def main(arguments): parser = argparse.ArgumentParser() - parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data') - parser.add_argument('--tasks', help='tasks to download data for as a comma separated string', - type=str, default='all') - parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_text.txt', - type=str, default='') + parser.add_argument("--data_dir", help="directory to save data to", type=str, default="glue_data") + parser.add_argument( + "--tasks", help="tasks to download data for as a comma separated string", type=str, default="all" + ) + parser.add_argument( + "--path_to_mrpc", + help="path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_text.txt", + type=str, + default="", + ) args = parser.parse_args(arguments) if not os.path.isdir(args.data_dir): @@ -130,13 +143,13 @@ def main(arguments): tasks = get_tasks(args.tasks) for task in tasks: - if task == 'MRPC': + if task == "MRPC": format_mrpc(args.data_dir, args.path_to_mrpc) - elif task == 'diagnostic': + elif task == "diagnostic": download_diagnostic(args.data_dir) else: download_and_extract(task, args.data_dir) -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main(sys.argv[1:])) diff --git a/utils/link_tester.py b/utils/link_tester.py index fe3990d28..0ef165c40 100644 --- a/utils/link_tester.py +++ b/utils/link_tester.py @@ -43,7 +43,7 @@ def scan_code_for_links(source): """ Scans the file to find links using a regular expression. Returns a list of links. """ - with open(source, 'r') as content: + with open(source, "r") as content: content = content.read() raw_links = re.findall(REGEXP_FIND_S3_LINKS, content) links = [prefix + suffix for _, prefix, suffix in raw_links] -- GitLab From 6e5291a9155a449e5f367a0bd057f3649f3532ec Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 15:49:11 +0100 Subject: [PATCH 03/32] Enforce black in CI. 
--- .circleci/config.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index bfa3b943a..500941548 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -86,6 +86,16 @@ jobs: - run: sudo pip install --progress-bar off -r docs/requirements.txt - run: sudo pip install --progress-bar off -r requirements.txt - run: ./.circleci/deploy.sh + check_code_quality: + working_directory: ~/transformers + docker: + - image: circleci/python:3.6 + resource_class: small + parallelism: 1 + steps: + - checkout + - run: sudo pip install black + - run: black --check --line-length 119 examples templates transformers utils check_repository_consistency: working_directory: ~/transformers docker: @@ -105,6 +115,7 @@ workflows: version: 2 build_and_test: jobs: + - check_code_quality - check_repository_consistency - run_examples_py3_torch - run_tests_py3_custom_tokenizers -- GitLab From 36883c1192248e04d59c4d0a7c6887d03ddfd5f2 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 15:50:39 +0100 Subject: [PATCH 04/32] Add "make style" to format code with black. --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..afc197c56 --- /dev/null +++ b/Makefile @@ -0,0 +1,4 @@ +.PHONY: style + +style: + black --line-length 119 examples templates transformers utils -- GitLab From bc1715c1e0872187a8b76d2f258d43815dcf6067 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 15:56:44 +0100 Subject: [PATCH 05/32] Add black-compatible isort configuration. lines_after_imports = 2 is a matter of taste; I like it. --- setup.cfg | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 setup.cfg diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 000000000..326d97a81 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,9 @@ +[isort] +ensure_newline_before_comments = True +force_grid_wrap = 0 +include_trailing_comma = True +known_first_party = transformers +line_length = 119 +lines_after_imports = 2 +multi_line_output = 3 +use_parentheses = True -- GitLab From 158e82e061c02fc2f1613adb7ac1d1cb6adae71c Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 15:57:32 +0100 Subject: [PATCH 06/32] Sort imports with isort. 
This is the result of: $ isort --recursive examples templates transformers utils hubconf.py setup.py --- examples/benchmarks.py | 11 +- examples/contrib/run_camembert.py | 4 +- examples/contrib/run_openai_gpt.py | 13 +- examples/contrib/run_swag.py | 20 ++- examples/contrib/run_transfo_xl.py | 5 +- examples/distillation/distiller.py | 22 +-- .../distillation/grouped_batch_sampler.py | 2 +- examples/distillation/lm_seqs_dataset.py | 2 +- .../distillation/run_squad_w_distillation.py | 38 ++-- .../distillation/scripts/binarized_data.py | 7 +- examples/distillation/scripts/extract.py | 7 +- .../scripts/extract_distilbert.py | 7 +- examples/distillation/scripts/token_counts.py | 5 +- examples/distillation/train.py | 26 ++- examples/distillation/utils.py | 8 +- examples/mm-imdb/run_mmimdb.py | 38 ++-- examples/mm-imdb/utils_mmimdb.py | 6 +- examples/pplm/run_pplm.py | 3 +- examples/pplm/run_pplm_discrim_train.py | 8 +- examples/run_bertology.py | 16 +- examples/run_generation.py | 22 ++- examples/run_glue.py | 40 ++--- examples/run_lm_finetuning.py | 28 +-- examples/run_multiple_choice.py | 23 ++- examples/run_ner.py | 29 ++- examples/run_squad.py | 48 ++--- examples/run_tf_glue.py | 7 +- examples/run_tf_ner.py | 36 ++-- examples/run_xnli.py | 25 ++- ...ert_bertabs_original_pytorch_checkpoint.py | 6 +- examples/summarization/modeling_bertabs.py | 3 +- examples/summarization/run_summarization.py | 11 +- examples/summarization/utils_summarization.py | 2 +- .../summarization/utils_summarization_test.py | 7 +- examples/test_examples.py | 16 +- examples/utils_multiple_choice.py | 11 +- examples/utils_ner.py | 1 + hubconf.py | 7 +- setup.py | 1 + .../adding_a_new_example_script/run_xxx.py | 35 ++-- .../adding_a_new_example_script/utils_xxx.py | 5 +- .../adding_a_new_model/configuration_xxx.py | 4 +- ...t_xxx_original_tf_checkpoint_to_pytorch.py | 7 +- .../adding_a_new_model/modeling_tf_xxx.py | 7 +- templates/adding_a_new_model/modeling_xxx.py | 7 +- .../tests/modeling_tf_xxx_test.py | 11 +- .../tests/modeling_xxx_test.py | 7 +- .../tests/tokenization_xxx_test.py | 2 +- .../adding_a_new_model/tokenization_xxx.py | 1 + transformers/__init__.py | 166 +++++++++--------- transformers/commands/convert.py | 1 - transformers/commands/run.py | 2 +- transformers/commands/serving.py | 11 +- transformers/commands/train.py | 9 +- transformers/commands/user.py | 2 +- transformers/configuration_albert.py | 1 + transformers/configuration_auto.py | 27 +-- transformers/configuration_bert.py | 1 + transformers/configuration_camembert.py | 1 + transformers/configuration_ctrl.py | 1 + transformers/configuration_distilbert.py | 3 +- transformers/configuration_gpt2.py | 1 + transformers/configuration_mmbt.py | 1 + transformers/configuration_openai.py | 1 + transformers/configuration_roberta.py | 1 + transformers/configuration_t5.py | 4 +- transformers/configuration_transfo_xl.py | 1 + transformers/configuration_utils.py | 3 +- transformers/configuration_xlm.py | 1 + transformers/configuration_xlm_roberta.py | 1 + transformers/configuration_xlnet.py | 1 + ...lbert_original_tf_checkpoint_to_pytorch.py | 7 +- ..._bert_original_tf_checkpoint_to_pytorch.py | 7 +- ..._bert_pytorch_checkpoint_to_original_tf.py | 6 +- ..._gpt2_original_tf_checkpoint_to_pytorch.py | 2 +- ...penai_original_tf_checkpoint_to_pytorch.py | 2 +- .../convert_pytorch_checkpoint_to_tf2.py | 75 ++++---- ..._original_pytorch_checkpoint_to_pytorch.py | 14 +- ...rt_t5_original_tf_checkpoint_to_pytorch.py | 7 +- ...fo_xl_original_tf_checkpoint_to_pytorch.py | 13 
+- ..._original_pytorch_checkpoint_to_pytorch.py | 4 +- ...xlnet_original_tf_checkpoint_to_pytorch.py | 12 +- transformers/data/__init__.py | 20 ++- transformers/data/metrics/__init__.py | 3 +- transformers/data/metrics/squad_metrics.py | 8 +- transformers/data/processors/__init__.py | 6 +- transformers/data/processors/glue.py | 3 +- transformers/data/processors/squad.py | 13 +- transformers/data/processors/utils.py | 5 +- transformers/data/processors/xnli.py | 1 + transformers/file_utils.py | 13 +- transformers/hf_api.py | 1 + transformers/modelcard.py | 5 +- transformers/modeling_albert.py | 12 +- transformers/modeling_auto.py | 104 ++++++----- transformers/modeling_bert.py | 3 +- transformers/modeling_camembert.py | 9 +- transformers/modeling_ctrl.py | 4 +- transformers/modeling_distilbert.py | 8 +- transformers/modeling_encoder_decoder.py | 1 + transformers/modeling_gpt2.py | 3 +- transformers/modeling_mmbt.py | 1 + transformers/modeling_openai.py | 3 +- transformers/modeling_roberta.py | 3 +- transformers/modeling_t5.py | 11 +- transformers/modeling_tf_albert.py | 5 +- transformers/modeling_tf_auto.py | 83 +++++---- transformers/modeling_tf_bert.py | 3 +- transformers/modeling_tf_ctrl.py | 4 +- transformers/modeling_tf_distilbert.py | 8 +- transformers/modeling_tf_gpt2.py | 11 +- transformers/modeling_tf_openai.py | 11 +- transformers/modeling_tf_pytorch_utils.py | 2 + transformers/modeling_tf_roberta.py | 4 +- transformers/modeling_tf_t5.py | 7 +- transformers/modeling_tf_transfo_xl.py | 11 +- .../modeling_tf_transfo_xl_utilities.py | 1 - transformers/modeling_tf_utils.py | 5 +- transformers/modeling_tf_xlm.py | 11 +- transformers/modeling_tf_xlnet.py | 2 +- transformers/modeling_transfo_xl.py | 11 +- transformers/modeling_transfo_xl_utilities.py | 2 +- transformers/modeling_utils.py | 3 +- transformers/modeling_xlm.py | 8 +- transformers/modeling_xlm_roberta.py | 9 +- transformers/modeling_xlnet.py | 12 +- transformers/optimization.py | 1 + transformers/optimization_tf.py | 4 +- transformers/pipelines.py | 20 +-- .../tests/configuration_common_test.py | 8 +- transformers/tests/hf_api_test.py | 1 + transformers/tests/model_card_test.py | 3 +- transformers/tests/modeling_albert_test.py | 7 +- transformers/tests/modeling_auto_test.py | 11 +- transformers/tests/modeling_bert_test.py | 7 +- transformers/tests/modeling_common_test.py | 16 +- transformers/tests/modeling_ctrl_test.py | 15 +- .../tests/modeling_distilbert_test.py | 13 +- .../tests/modeling_encoder_decoder_test.py | 2 + transformers/tests/modeling_gpt2_test.py | 13 +- transformers/tests/modeling_openai_test.py | 13 +- transformers/tests/modeling_roberta_test.py | 13 +- transformers/tests/modeling_t5_test.py | 7 +- transformers/tests/modeling_tf_albert_test.py | 11 +- transformers/tests/modeling_tf_auto_test.py | 11 +- transformers/tests/modeling_tf_bert_test.py | 11 +- transformers/tests/modeling_tf_common_test.py | 10 +- transformers/tests/modeling_tf_ctrl_test.py | 11 +- .../tests/modeling_tf_distilbert_test.py | 9 +- transformers/tests/modeling_tf_gpt2_test.py | 11 +- .../tests/modeling_tf_openai_gpt_test.py | 11 +- .../tests/modeling_tf_roberta_test.py | 9 +- transformers/tests/modeling_tf_t5_test.py | 11 +- .../tests/modeling_tf_transfo_xl_test.py | 11 +- transformers/tests/modeling_tf_xlm_test.py | 13 +- transformers/tests/modeling_tf_xlnet_test.py | 17 +- .../tests/modeling_transfo_xl_test.py | 15 +- transformers/tests/modeling_xlm_test.py | 13 +- transformers/tests/modeling_xlnet_test.py | 17 +- 
transformers/tests/optimization_test.py | 13 +- transformers/tests/optimization_tf_test.py | 5 +- transformers/tests/pipelines_test.py | 2 +- .../tests/tokenization_albert_test.py | 3 +- transformers/tests/tokenization_auto_test.py | 19 +- .../tests/tokenization_bert_japanese_test.py | 6 +- transformers/tests/tokenization_bert_test.py | 2 +- transformers/tests/tokenization_ctrl_test.py | 4 +- .../tests/tokenization_distilbert_test.py | 2 +- transformers/tests/tokenization_gpt2_test.py | 4 +- .../tests/tokenization_openai_test.py | 4 +- .../tests/tokenization_roberta_test.py | 5 +- transformers/tests/tokenization_t5_test.py | 1 + .../tests/tokenization_tests_commons.py | 5 +- .../tests/tokenization_transfo_xl_test.py | 7 +- transformers/tests/tokenization_utils_test.py | 5 +- transformers/tests/tokenization_xlm_test.py | 4 +- transformers/tests/tokenization_xlnet_test.py | 3 +- transformers/tests/utils.py | 3 +- transformers/tokenization_albert.py | 9 +- transformers/tokenization_auto.py | 17 +- transformers/tokenization_bert.py | 1 + transformers/tokenization_bert_japanese.py | 6 +- transformers/tokenization_camembert.py | 3 + transformers/tokenization_ctrl.py | 4 +- transformers/tokenization_distilbert.py | 1 + transformers/tokenization_gpt2.py | 10 +- transformers/tokenization_openai.py | 3 +- transformers/tokenization_roberta.py | 6 +- transformers/tokenization_t5.py | 4 +- transformers/tokenization_transfo_xl.py | 1 + transformers/tokenization_utils.py | 12 +- transformers/tokenization_xlm.py | 3 +- transformers/tokenization_xlm_roberta.py | 3 + transformers/tokenization_xlnet.py | 3 +- utils/download_glue_data.py | 5 +- 195 files changed, 1085 insertions(+), 947 deletions(-) diff --git a/examples/benchmarks.py b/examples/benchmarks.py index 20b62112b..4ef0640e3 100644 --- a/examples/benchmarks.py +++ b/examples/benchmarks.py @@ -18,12 +18,14 @@ # If checking the tensors placement # tf.debugging.set_log_device_placement(True) -from typing import List -import timeit -from transformers import is_tf_available, is_torch_available -from time import time import argparse import csv +import timeit +from time import time +from typing import List + +from transformers import AutoConfig, AutoTokenizer, is_tf_available, is_torch_available + if is_tf_available(): import tensorflow as tf @@ -33,7 +35,6 @@ if is_torch_available(): import torch from transformers import AutoModel -from transformers import AutoConfig, AutoTokenizer input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as the Director of Hatcheries and Conditioning entered the room, in the diff --git a/examples/contrib/run_camembert.py b/examples/contrib/run_camembert.py index 99f54f544..791a02fed 100644 --- a/examples/contrib/run_camembert.py +++ b/examples/contrib/run_camembert.py @@ -1,11 +1,11 @@ -from pathlib import Path import tarfile import urllib.request +from pathlib import Path import torch -from transformers.tokenization_camembert import CamembertTokenizer from transformers.modeling_camembert import CamembertForMaskedLM +from transformers.tokenization_camembert import CamembertTokenizer def fill_mask(masked_input, model, tokenizer, topk=5): diff --git a/examples/contrib/run_openai_gpt.py b/examples/contrib/run_openai_gpt.py index f6431c80b..e35f3d4fe 100644 --- a/examples/contrib/run_openai_gpt.py +++ b/examples/contrib/run_openai_gpt.py @@ -28,26 +28,27 @@ --train_batch_size 16 \ """ import argparse -import os import csv -import random import logging -from tqdm import tqdm, trange +import os 
+import random import numpy as np import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset +from tqdm import tqdm, trange from transformers import ( + CONFIG_NAME, + WEIGHTS_NAME, + AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, - AdamW, cached_path, - WEIGHTS_NAME, - CONFIG_NAME, get_linear_schedule_with_warmup, ) + ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz" logging.basicConfig( diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py index d03d1aace..65c07c2a3 100644 --- a/examples/contrib/run_swag.py +++ b/examples/contrib/run_swag.py @@ -19,28 +19,34 @@ from __future__ import absolute_import, division, print_function import argparse -import logging import csv +import glob +import logging import os import random import sys -import glob import numpy as np import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler +from tqdm import tqdm, trange + +from transformers import ( + WEIGHTS_NAME, + AdamW, + BertConfig, + BertForMultipleChoice, + BertTokenizer, + get_linear_schedule_with_warmup, +) + try: from torch.utils.tensorboard import SummaryWriter except: from tensorboardX import SummaryWriter -from tqdm import tqdm, trange - -from transformers import WEIGHTS_NAME, BertConfig, BertForMultipleChoice, BertTokenizer - -from transformers import AdamW, get_linear_schedule_with_warmup logger = logging.getLogger(__name__) diff --git a/examples/contrib/run_transfo_xl.py b/examples/contrib/run_transfo_xl.py index 1ef66bef1..e4af4f6db 100644 --- a/examples/contrib/run_transfo_xl.py +++ b/examples/contrib/run_transfo_xl.py @@ -23,12 +23,13 @@ from __future__ import absolute_import, division, print_function, unicode_litera import argparse import logging -import time import math +import time import torch -from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer +from transformers import TransfoXLCorpus, TransfoXLLMHeadModel, TransfoXLTokenizer + logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index e3bf0d443..a957b1a09 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -15,31 +15,31 @@ """ The distiller to distil the student. 
Adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) """ -import os import math -import psutil +import os import time -from tqdm import trange, tqdm -import numpy as np +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from torch.optim import AdamW +from torch.utils.data import BatchSampler, DataLoader, RandomSampler from torch.utils.data.distributed import DistributedSampler -from torch.utils.data import RandomSampler, BatchSampler, DataLoader +from tqdm import tqdm, trange + +import psutil +from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups +from lm_seqs_dataset import LmSeqsDataset +from transformers import get_linear_schedule_with_warmup +from utils import logger + try: from torch.utils.tensorboard import SummaryWriter except: from tensorboardX import SummaryWriter -from transformers import get_linear_schedule_with_warmup - -from utils import logger -from lm_seqs_dataset import LmSeqsDataset -from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups - class Distiller: def __init__( diff --git a/examples/distillation/grouped_batch_sampler.py b/examples/distillation/grouped_batch_sampler.py index 1132fdb58..c386c4224 100644 --- a/examples/distillation/grouped_batch_sampler.py +++ b/examples/distillation/grouped_batch_sampler.py @@ -17,8 +17,8 @@ import bisect import copy from collections import defaultdict -import numpy as np +import numpy as np from torch.utils.data.sampler import BatchSampler, Sampler from utils import logger diff --git a/examples/distillation/lm_seqs_dataset.py b/examples/distillation/lm_seqs_dataset.py index bb0d80f38..691e010cf 100644 --- a/examples/distillation/lm_seqs_dataset.py +++ b/examples/distillation/lm_seqs_dataset.py @@ -15,10 +15,10 @@ """ Dataset to distilled models adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) """ +import numpy as np import torch from torch.utils.data import Dataset -import numpy as np from utils import logger diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py index 0d5a004eb..11524e388 100644 --- a/examples/distillation/run_squad_w_distillation.py +++ b/examples/distillation/run_squad_w_distillation.py @@ -18,56 +18,58 @@ from __future__ import absolute_import, division, print_function import argparse +import glob import logging import os import random -import glob import numpy as np import torch +import torch.nn as nn +import torch.nn.functional as F from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler -import torch.nn.functional as F -import torch.nn as nn - -try: - from torch.utils.tensorboard import SummaryWriter -except: - from tensorboardX import SummaryWriter - from tqdm import tqdm, trange from transformers import ( WEIGHTS_NAME, + AdamW, BertConfig, BertForQuestionAnswering, BertTokenizer, + DistilBertConfig, + DistilBertForQuestionAnswering, + DistilBertTokenizer, XLMConfig, XLMForQuestionAnswering, XLMTokenizer, XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer, - DistilBertConfig, - DistilBertForQuestionAnswering, - DistilBertTokenizer, + get_linear_schedule_with_warmup, ) -from transformers import AdamW, get_linear_schedule_with_warmup - from ..utils_squad import ( - read_squad_examples, - convert_examples_to_features, RawResult, - write_predictions, RawResultExtended, + convert_examples_to_features, + 
read_squad_examples, + write_predictions, write_predictions_extended, ) # The follwing import is the official SQuAD evaluation script (2.0). # You can remove it from the dependencies if you are using this script outside of the library # We've added it here for automated tests (see examples/test_examples.py file) -from ..utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad +from ..utils_squad_evaluate import EVAL_OPTS +from ..utils_squad_evaluate import main as evaluate_on_squad + + +try: + from torch.utils.tensorboard import SummaryWriter +except: + from tensorboardX import SummaryWriter + logger = logging.getLogger(__name__) diff --git a/examples/distillation/scripts/binarized_data.py b/examples/distillation/scripts/binarized_data.py index 40bde7d15..7590cfcbc 100644 --- a/examples/distillation/scripts/binarized_data.py +++ b/examples/distillation/scripts/binarized_data.py @@ -16,12 +16,15 @@ Preprocessing script before distillation. """ import argparse +import logging import pickle import random import time + import numpy as np -from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer -import logging + +from transformers import BertTokenizer, GPT2Tokenizer, RobertaTokenizer + logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO diff --git a/examples/distillation/scripts/extract.py b/examples/distillation/scripts/extract.py index 9610f8f17..429350a77 100644 --- a/examples/distillation/scripts/extract.py +++ b/examples/distillation/scripts/extract.py @@ -16,10 +16,13 @@ Preprocessing script before training the distilled model. Specific to RoBERTa -> DistilRoBERTa and GPT2 -> DistilGPT2. """ -from transformers import BertForMaskedLM, RobertaForMaskedLM, GPT2LMHeadModel -import torch import argparse +import torch + +from transformers import BertForMaskedLM, GPT2LMHeadModel, RobertaForMaskedLM + + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation" diff --git a/examples/distillation/scripts/extract_distilbert.py b/examples/distillation/scripts/extract_distilbert.py index 8e58db555..db0dc3ed8 100644 --- a/examples/distillation/scripts/extract_distilbert.py +++ b/examples/distillation/scripts/extract_distilbert.py @@ -16,10 +16,13 @@ Preprocessing script before training DistilBERT. Specific to BERT -> DistilBERT. """ -from transformers import BertForMaskedLM, RobertaForMaskedLM -import torch import argparse +import torch + +from transformers import BertForMaskedLM, RobertaForMaskedLM + + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation" diff --git a/examples/distillation/scripts/token_counts.py b/examples/distillation/scripts/token_counts.py index 623caad4b..0238bf66f 100644 --- a/examples/distillation/scripts/token_counts.py +++ b/examples/distillation/scripts/token_counts.py @@ -15,10 +15,11 @@ """ Preprocessing script before training the distilled model. 
""" -from collections import Counter import argparse -import pickle import logging +import pickle +from collections import Counter + logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO diff --git a/examples/distillation/train.py b/examples/distillation/train.py index 37c49ae7b..a37a7c427 100644 --- a/examples/distillation/train.py +++ b/examples/distillation/train.py @@ -16,22 +16,32 @@ Training the distilled model. Supported architectures include: BERT -> DistilBERT, RoBERTa -> DistilRoBERTa, GPT2 -> DistilGPT2. """ -import os import argparse -import pickle import json +import os +import pickle import shutil + import numpy as np import torch -from transformers import BertConfig, BertForMaskedLM, BertTokenizer -from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer -from transformers import DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer -from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer - from distiller import Distiller -from utils import git_log, logger, init_gpu_params, set_seed from lm_seqs_dataset import LmSeqsDataset +from transformers import ( + BertConfig, + BertForMaskedLM, + BertTokenizer, + DistilBertConfig, + DistilBertForMaskedLM, + DistilBertTokenizer, + GPT2Config, + GPT2LMHeadModel, + GPT2Tokenizer, + RobertaConfig, + RobertaForMaskedLM, + RobertaTokenizer, +) +from utils import git_log, init_gpu_params, logger, set_seed MODEL_CLASSES = { diff --git a/examples/distillation/utils.py b/examples/distillation/utils.py index f9d7412cb..b081f239c 100644 --- a/examples/distillation/utils.py +++ b/examples/distillation/utils.py @@ -15,14 +15,16 @@ """ Utils to train DistilBERT adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) """ -import git import json +import logging import os import socket -import torch + import numpy as np +import torch + +import git -import logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s", diff --git a/examples/mm-imdb/run_mmimdb.py b/examples/mm-imdb/run_mmimdb.py index c92dbd3d3..e87555f7d 100644 --- a/examples/mm-imdb/run_mmimdb.py +++ b/examples/mm-imdb/run_mmimdb.py @@ -19,32 +19,33 @@ from __future__ import absolute_import, division, print_function import argparse import glob +import json import logging import os import random -import json -from sklearn.metrics import f1_score import numpy as np import torch import torch.nn as nn +from sklearn.metrics import f1_score from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler - -try: - from torch.utils.tensorboard import SummaryWriter -except: - from tensorboardX import SummaryWriter - from tqdm import tqdm, trange -from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_mmimdb_labels, get_image_transforms - from transformers import ( WEIGHTS_NAME, + AdamW, + AlbertConfig, + AlbertModel, + AlbertTokenizer, BertConfig, BertModel, BertTokenizer, + DistilBertConfig, + DistilBertModel, + DistilBertTokenizer, + MMBTConfig, + MMBTForClassification, RobertaConfig, RobertaModel, RobertaTokenizer, @@ -54,17 +55,16 @@ from transformers import ( XLNetConfig, XLNetModel, XLNetTokenizer, - DistilBertConfig, - DistilBertModel, - DistilBertTokenizer, - AlbertConfig, - AlbertModel, - AlbertTokenizer, - MMBTForClassification, - MMBTConfig, + get_linear_schedule_with_warmup, ) +from utils_mmimdb import 
ImageEncoder, JsonlDataset, collate_fn, get_image_transforms, get_mmimdb_labels + + +try: + from torch.utils.tensorboard import SummaryWriter +except: + from tensorboardX import SummaryWriter -from transformers import AdamW, get_linear_schedule_with_warmup logger = logging.getLogger(__name__) diff --git a/examples/mm-imdb/utils_mmimdb.py b/examples/mm-imdb/utils_mmimdb.py index 57cee25f9..7a52a99b1 100644 --- a/examples/mm-imdb/utils_mmimdb.py +++ b/examples/mm-imdb/utils_mmimdb.py @@ -17,13 +17,15 @@ import json import os from collections import Counter -from PIL import Image import torch import torch.nn as nn +from torch.utils.data import Dataset + import torchvision import torchvision.transforms as transforms -from torch.utils.data import Dataset +from PIL import Image + POOLING_BREAKDOWN = {1: (1, 1), 2: (2, 1), 3: (3, 1), 4: (2, 2), 5: (5, 1), 6: (3, 2), 7: (7, 1), 8: (4, 2), 9: (3, 3)} diff --git a/examples/pplm/run_pplm.py b/examples/pplm/run_pplm.py index 37183a512..ec848323e 100644 --- a/examples/pplm/run_pplm.py +++ b/examples/pplm/run_pplm.py @@ -34,10 +34,11 @@ import torch.nn.functional as F from torch.autograd import Variable from tqdm import trange +from pplm_classification_head import ClassificationHead from transformers import GPT2Tokenizer from transformers.file_utils import cached_path from transformers.modeling_gpt2 import GPT2LMHeadModel -from pplm_classification_head import ClassificationHead + PPLM_BOW = 1 PPLM_DISCRIM = 2 diff --git a/examples/pplm/run_pplm_discrim_train.py b/examples/pplm/run_pplm_discrim_train.py index 14136c4c7..287715e53 100644 --- a/examples/pplm/run_pplm_discrim_train.py +++ b/examples/pplm/run_pplm_discrim_train.py @@ -24,16 +24,16 @@ import time import numpy as np import torch import torch.nn.functional as F -import torch.optim import torch.optim as optim import torch.utils.data as data +from tqdm import tqdm, trange + from nltk.tokenize.treebank import TreebankWordDetokenizer +from pplm_classification_head import ClassificationHead from torchtext import data as torchtext_data from torchtext import datasets -from tqdm import tqdm, trange +from transformers import GPT2LMHeadModel, GPT2Tokenizer -from transformers import GPT2Tokenizer, GPT2LMHeadModel -from pplm_classification_head import ClassificationHead torch.manual_seed(0) np.random.seed(0) diff --git a/examples/run_bertology.py b/examples/run_bertology.py index 6b4739d6b..27709fa7e 100644 --- a/examples/run_bertology.py +++ b/examples/run_bertology.py @@ -19,19 +19,19 @@ Some parts of this script are adapted from the code of Michel et al. 
(http://arxiv.org/abs/1905.10650) which is available at https://github.com/pmichel31415/are-16-heads-really-better-than-1 """ -import os import argparse import logging -from datetime import timedelta, datetime -from tqdm import tqdm +import os +from datetime import datetime, timedelta import numpy as np - import torch -from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subset -from torch.utils.data.distributed import DistributedSampler from torch.nn import CrossEntropyLoss, MSELoss +from torch.utils.data import DataLoader, SequentialSampler, Subset, TensorDataset +from torch.utils.data.distributed import DistributedSampler +from tqdm import tqdm +from run_glue import ALL_MODELS, MODEL_CLASSES, load_and_cache_examples, set_seed from transformers import ( WEIGHTS_NAME, BertConfig, @@ -44,13 +44,11 @@ from transformers import ( XLNetForSequenceClassification, XLNetTokenizer, ) - -from run_glue import set_seed, load_and_cache_examples, ALL_MODELS, MODEL_CLASSES - from transformers import glue_compute_metrics as compute_metrics from transformers import glue_output_modes as output_modes from transformers import glue_processors as processors + logger = logging.getLogger(__name__) diff --git a/examples/run_generation.py b/examples/run_generation.py index e62ccf87c..629b9348a 100644 --- a/examples/run_generation.py +++ b/examples/run_generation.py @@ -21,15 +21,23 @@ from __future__ import absolute_import, division, print_function, unicode_litera import argparse import logging -import torch import numpy as np +import torch -from transformers import GPT2LMHeadModel, GPT2Tokenizer -from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer -from transformers import XLNetLMHeadModel, XLNetTokenizer -from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer -from transformers import CTRLLMHeadModel, CTRLTokenizer -from transformers import XLMWithLMHeadModel, XLMTokenizer +from transformers import ( + CTRLLMHeadModel, + CTRLTokenizer, + GPT2LMHeadModel, + GPT2Tokenizer, + OpenAIGPTLMHeadModel, + OpenAIGPTTokenizer, + TransfoXLLMHeadModel, + TransfoXLTokenizer, + XLMTokenizer, + XLMWithLMHeadModel, + XLNetLMHeadModel, + XLNetTokenizer, +) logging.basicConfig( diff --git a/examples/run_glue.py b/examples/run_glue.py index bbfd52ea3..d70e20f33 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -19,54 +19,54 @@ from __future__ import absolute_import, division, print_function import argparse import glob +import json import logging import os import random -import json import numpy as np import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler - -try: - from torch.utils.tensorboard import SummaryWriter -except: - from tensorboardX import SummaryWriter - from tqdm import tqdm, trange from transformers import ( WEIGHTS_NAME, + AdamW, + AlbertConfig, + AlbertForSequenceClassification, + AlbertTokenizer, BertConfig, BertForSequenceClassification, BertTokenizer, + DistilBertConfig, + DistilBertForSequenceClassification, + DistilBertTokenizer, RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer, XLMConfig, XLMForSequenceClassification, + XLMRobertaConfig, + XLMRobertaForSequenceClassification, + XLMRobertaTokenizer, XLMTokenizer, XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer, - DistilBertConfig, - DistilBertForSequenceClassification, - DistilBertTokenizer, - AlbertConfig, - AlbertForSequenceClassification, - AlbertTokenizer, - 
XLMRobertaConfig, - XLMRobertaForSequenceClassification, - XLMRobertaTokenizer, + get_linear_schedule_with_warmup, ) - -from transformers import AdamW, get_linear_schedule_with_warmup - from transformers import glue_compute_metrics as compute_metrics +from transformers import glue_convert_examples_to_features as convert_examples_to_features from transformers import glue_output_modes as output_modes from transformers import glue_processors as processors -from transformers import glue_convert_examples_to_features as convert_examples_to_features + + +try: + from torch.utils.tensorboard import SummaryWriter +except: + from tensorboardX import SummaryWriter + logger = logging.getLogger(__name__) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 60b99f29d..f916897d0 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -32,23 +32,22 @@ import shutil import numpy as np import torch -from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler +from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler - -try: - from torch.utils.tensorboard import SummaryWriter -except: - from tensorboardX import SummaryWriter - from tqdm import tqdm, trange from transformers import ( WEIGHTS_NAME, AdamW, - get_linear_schedule_with_warmup, BertConfig, BertForMaskedLM, BertTokenizer, + CamembertConfig, + CamembertForMaskedLM, + CamembertTokenizer, + DistilBertConfig, + DistilBertForMaskedLM, + DistilBertTokenizer, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, @@ -58,15 +57,16 @@ from transformers import ( RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, - DistilBertConfig, - DistilBertForMaskedLM, - DistilBertTokenizer, - CamembertConfig, - CamembertForMaskedLM, - CamembertTokenizer, + get_linear_schedule_with_warmup, ) +try: + from torch.utils.tensorboard import SummaryWriter +except: + from tensorboardX import SummaryWriter + + logger = logging.getLogger(__name__) diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index bfa62cfb7..19ca558ca 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -23,35 +23,34 @@ import logging import os import random - import numpy as np import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler - -try: - from torch.utils.tensorboard import SummaryWriter -except: - from tensorboardX import SummaryWriter - from tqdm import tqdm, trange from transformers import ( WEIGHTS_NAME, + AdamW, BertConfig, BertForMultipleChoice, BertTokenizer, - XLNetConfig, - XLNetForMultipleChoice, - XLNetTokenizer, RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer, + XLNetConfig, + XLNetForMultipleChoice, + XLNetTokenizer, + get_linear_schedule_with_warmup, ) +from utils_multiple_choice import convert_examples_to_features, processors -from transformers import AdamW, get_linear_schedule_with_warmup -from utils_multiple_choice import convert_examples_to_features, processors +try: + from torch.utils.tensorboard import SummaryWriter +except: + from tensorboardX import SummaryWriter + logger = logging.getLogger(__name__) diff --git a/examples/run_ner.py b/examples/run_ner.py index 48ac61b4f..8d991555a 100644 --- a/examples/run_ner.py +++ b/examples/run_ner.py @@ -25,20 +25,35 @@ import random import numpy as np import torch -from seqeval.metrics import precision_score, 
recall_score, f1_score from tensorboardX import SummaryWriter from torch.nn import CrossEntropyLoss from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange + +from seqeval.metrics import f1_score, precision_score, recall_score +from transformers import ( + WEIGHTS_NAME, + AdamW, + BertConfig, + BertForTokenClassification, + BertTokenizer, + CamembertConfig, + CamembertForTokenClassification, + CamembertTokenizer, + DistilBertConfig, + DistilBertForTokenClassification, + DistilBertTokenizer, + RobertaConfig, + RobertaForTokenClassification, + RobertaTokenizer, + XLMRobertaConfig, + XLMRobertaForTokenClassification, + XLMRobertaTokenizer, + get_linear_schedule_with_warmup, +) from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file -from transformers import AdamW, get_linear_schedule_with_warmup -from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer -from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer -from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer -from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer -from transformers import XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer logger = logging.getLogger(__name__) diff --git a/examples/run_squad.py b/examples/run_squad.py index 1580a31e8..9dbc39cb7 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -16,57 +16,57 @@ """ Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet).""" from __future__ import absolute_import, division, print_function -from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult -from transformers.data.metrics.squad_metrics import ( - compute_predictions_logits, - compute_predictions_log_probs, - squad_evaluate, -) import argparse +import glob import logging import os import random -import glob import timeit + import numpy as np import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler - -try: - from torch.utils.tensorboard import SummaryWriter -except: - from tensorboardX import SummaryWriter - from tqdm import tqdm, trange from transformers import ( WEIGHTS_NAME, + AdamW, + AlbertConfig, + AlbertForQuestionAnswering, + AlbertTokenizer, BertConfig, BertForQuestionAnswering, BertTokenizer, + DistilBertConfig, + DistilBertForQuestionAnswering, + DistilBertTokenizer, + RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer, - RobertaConfig, XLMConfig, XLMForQuestionAnswering, XLMTokenizer, XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer, - DistilBertConfig, - DistilBertForQuestionAnswering, - DistilBertTokenizer, - AlbertConfig, - AlbertForQuestionAnswering, - AlbertTokenizer, - XLMConfig, - XLMForQuestionAnswering, - XLMTokenizer, + get_linear_schedule_with_warmup, + squad_convert_examples_to_features, ) +from transformers.data.metrics.squad_metrics import ( + compute_predictions_log_probs, + compute_predictions_logits, + squad_evaluate, +) +from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor + + +try: + from torch.utils.tensorboard import SummaryWriter +except: + from tensorboardX import SummaryWriter -from transformers import AdamW, 
get_linear_schedule_with_warmup, squad_convert_examples_to_features logger = logging.getLogger(__name__) diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py index 74a6db34a..511a98e94 100644 --- a/examples/run_tf_glue.py +++ b/examples/run_tf_glue.py @@ -1,15 +1,18 @@ import os + import tensorflow as tf + import tensorflow_datasets from transformers import ( + BertConfig, + BertForSequenceClassification, BertTokenizer, TFBertForSequenceClassification, - BertConfig, glue_convert_examples_to_features, - BertForSequenceClassification, glue_processors, ) + # script parameters BATCH_SIZE = 32 EVAL_BATCH_SIZE = BATCH_SIZE * 2 diff --git a/examples/run_tf_ner.py b/examples/run_tf_ner.py index 77850d1ab..68c4b15a0 100644 --- a/examples/run_tf_ner.py +++ b/examples/run_tf_ner.py @@ -1,23 +1,33 @@ # coding=utf-8 +import _pickle as pickle +import collections import datetime -import os -import math import glob +import math +import os import re -import tensorflow as tf -import collections + import numpy as np +import tensorflow as tf +from absl import app, flags, logging + +from fastprogress import master_bar, progress_bar from seqeval import metrics -import _pickle as pickle -from absl import logging -from transformers import TF2_WEIGHTS_NAME, BertConfig, BertTokenizer, TFBertForTokenClassification -from transformers import RobertaConfig, RobertaTokenizer, TFRobertaForTokenClassification -from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForTokenClassification -from transformers import create_optimizer, GradientAccumulator +from transformers import ( + TF2_WEIGHTS_NAME, + BertConfig, + BertTokenizer, + DistilBertConfig, + DistilBertTokenizer, + GradientAccumulator, + RobertaConfig, + RobertaTokenizer, + TFBertForTokenClassification, + TFDistilBertForTokenClassification, + TFRobertaForTokenClassification, + create_optimizer, +) from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file -from fastprogress import master_bar, progress_bar -from absl import flags -from absl import app ALL_MODELS = sum( diff --git a/examples/run_xnli.py b/examples/run_xnli.py index 9faba294d..bc1789f1d 100644 --- a/examples/run_xnli.py +++ b/examples/run_xnli.py @@ -28,34 +28,33 @@ import numpy as np import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler - -try: - from torch.utils.tensorboard import SummaryWriter -except: - from tensorboardX import SummaryWriter - from tqdm import tqdm, trange from transformers import ( WEIGHTS_NAME, + AdamW, BertConfig, BertForSequenceClassification, BertTokenizer, - XLMConfig, - XLMForSequenceClassification, - XLMTokenizer, DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer, + XLMConfig, + XLMForSequenceClassification, + XLMTokenizer, + get_linear_schedule_with_warmup, ) - -from transformers import AdamW, get_linear_schedule_with_warmup - +from transformers import glue_convert_examples_to_features as convert_examples_to_features from transformers import xnli_compute_metrics as compute_metrics from transformers import xnli_output_modes as output_modes from transformers import xnli_processors as processors -from transformers import glue_convert_examples_to_features as convert_examples_to_features + +try: + from torch.utils.tensorboard import SummaryWriter +except: + from tensorboardX import SummaryWriter + logger = logging.getLogger(__name__) diff --git 
a/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py b/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py index d32e6fc06..a1cbd64dd 100644 --- a/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py +++ b/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py @@ -20,13 +20,13 @@ the model within the original codebase to be able to only save its `state_dict`. """ import argparse -from collections import namedtuple import logging +from collections import namedtuple + import torch -from models.model_builder import AbsSummarizer # The authors' implementation from model_bertabs import BertAbsSummarizer - +from models.model_builder import AbsSummarizer # The authors' implementation from transformers import BertTokenizer diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py index d4d8c6648..e8087f300 100644 --- a/examples/summarization/modeling_bertabs.py +++ b/examples/summarization/modeling_bertabs.py @@ -27,9 +27,8 @@ import torch from torch import nn from torch.nn.init import xavier_uniform_ -from transformers import BertModel, BertConfig, PreTrainedModel - from configuration_bertabs import BertAbsConfig +from transformers import BertConfig, BertModel, PreTrainedModel MAX_SIZE = 5000 diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index 36210d999..1917ca30b 100644 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -1,26 +1,25 @@ #! /usr/bin/python3 import argparse -from collections import namedtuple import logging import os import sys +from collections import namedtuple import torch from torch.utils.data import DataLoader, SequentialSampler from tqdm import tqdm -from transformers import BertTokenizer - from modeling_bertabs import BertAbs, build_predictor - +from transformers import BertTokenizer from utils_summarization import ( SummarizationDataset, - encode_for_summarization, build_mask, - fit_to_block_size, compute_token_type_ids, + encode_for_summarization, + fit_to_block_size, ) + logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout, level=logging.INFO) diff --git a/examples/summarization/utils_summarization.py b/examples/summarization/utils_summarization.py index 96470f47a..360520fda 100644 --- a/examples/summarization/utils_summarization.py +++ b/examples/summarization/utils_summarization.py @@ -1,5 +1,5 @@ -from collections import deque import os +from collections import deque import torch from torch.utils.data import Dataset diff --git a/examples/summarization/utils_summarization_test.py b/examples/summarization/utils_summarization_test.py index 253eae388..86ec5b600 100644 --- a/examples/summarization/utils_summarization_test.py +++ b/examples/summarization/utils_summarization_test.py @@ -17,12 +17,7 @@ import unittest import numpy as np import torch -from utils_summarization import ( - compute_token_type_ids, - fit_to_block_size, - build_mask, - process_story, -) +from utils_summarization import build_mask, compute_token_type_ids, fit_to_block_size, process_story class SummarizationDataProcessingTest(unittest.TestCase): diff --git a/examples/test_examples.py b/examples/test_examples.py index 1293559c2..d27f5671a 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -12,14 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import sys -import unittest import argparse import logging +import sys +import unittest + +import run_generation +import run_glue +import run_squad + try: # python 3.4+ can use builtin unittest.mock instead of mock package @@ -27,9 +30,6 @@ try: except ImportError: from mock import patch -import run_glue -import run_squad -import run_generation logging.basicConfig(level=logging.DEBUG) diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py index 492eb23e3..987ffbc0e 100644 --- a/examples/utils_multiple_choice.py +++ b/examples/utils_multiple_choice.py @@ -17,16 +17,17 @@ from __future__ import absolute_import, division, print_function - +import csv +import glob +import json import logging import os import sys from io import open -import json -import csv -import glob -import tqdm from typing import List + +import tqdm + from transformers import PreTrainedTokenizer diff --git a/examples/utils_ner.py b/examples/utils_ner.py index d37583469..214064e84 100644 --- a/examples/utils_ner.py +++ b/examples/utils_ner.py @@ -21,6 +21,7 @@ import logging import os from io import open + logger = logging.getLogger(__name__) diff --git a/hubconf.py b/hubconf.py index 1d100271a..f8d0d1a84 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,13 +1,14 @@ from transformers import ( - AutoTokenizer, AutoConfig, AutoModel, - AutoModelWithLMHead, - AutoModelForSequenceClassification, AutoModelForQuestionAnswering, + AutoModelForSequenceClassification, + AutoModelWithLMHead, + AutoTokenizer, ) from transformers.file_utils import add_start_docstrings + dependencies = ["torch", "tqdm", "boto3", "requests", "regex", "sentencepiece", "sacremoses"] diff --git a/setup.py b/setup.py index 59dbfef12..13fe6d90f 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ To create the package for pypi. 
""" from io import open + from setuptools import find_packages, setup diff --git a/templates/adding_a_new_example_script/run_xxx.py b/templates/adding_a_new_example_script/run_xxx.py index 64e92f2a2..e7e95ede6 100644 --- a/templates/adding_a_new_example_script/run_xxx.py +++ b/templates/adding_a_new_example_script/run_xxx.py @@ -17,54 +17,55 @@ from __future__ import absolute_import, division, print_function import argparse +import glob import logging import os import random -import glob import numpy as np import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler - -try: - from torch.utils.tensorboard import SummaryWriter -except: - from tensorboardX import SummaryWriter - from tqdm import tqdm, trange from transformers import ( WEIGHTS_NAME, + AdamW, BertConfig, BertForQuestionAnswering, BertTokenizer, + DistilBertConfig, + DistilBertForQuestionAnswering, + DistilBertTokenizer, XLMConfig, XLMForQuestionAnswering, XLMTokenizer, XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer, - DistilBertConfig, - DistilBertForQuestionAnswering, - DistilBertTokenizer, + get_linear_schedule_with_warmup, ) - -from transformers import AdamW, get_linear_schedule_with_warmup - from utils_squad import ( - read_squad_examples, - convert_examples_to_features, RawResult, - write_predictions, RawResultExtended, + convert_examples_to_features, + read_squad_examples, + write_predictions, write_predictions_extended, ) # The follwing import is the official SQuAD evaluation script (2.0). # You can remove it from the dependencies if you are using this script outside of the library # We've added it here for automated tests (see examples/test_examples.py file) -from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad +from utils_squad_evaluate import EVAL_OPTS +from utils_squad_evaluate import main as evaluate_on_squad + + +try: + from torch.utils.tensorboard import SummaryWriter +except: + from tensorboardX import SummaryWriter + logger = logging.getLogger(__name__) diff --git a/templates/adding_a_new_example_script/utils_xxx.py b/templates/adding_a_new_example_script/utils_xxx.py index bd016bd30..4c5b97bd5 100644 --- a/templates/adding_a_new_example_script/utils_xxx.py +++ b/templates/adding_a_new_example_script/utils_xxx.py @@ -16,16 +16,17 @@ from __future__ import absolute_import, division, print_function +import collections import json import logging import math -import collections from io import open from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize # Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method) -from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores +from utils_squad_evaluate import find_all_best_thresh_v2, get_raw_scores, make_qid_to_has_ans + logger = logging.getLogger(__name__) diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py index 370fbb569..9670b4f8c 100644 --- a/templates/adding_a_new_model/configuration_xxx.py +++ b/templates/adding_a_new_model/configuration_xxx.py @@ -19,11 +19,13 @@ from __future__ import absolute_import, division, print_function, unicode_litera import json import logging import sys -import six from io import open +import six + from .configuration_utils import PretrainedConfig + logger = logging.getLogger(__name__) XXX_PRETRAINED_CONFIG_ARCHIVE_MAP = { diff --git 
a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py index 99d376149..2e6c47347 100755 --- a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py +++ b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py @@ -14,16 +14,15 @@ # limitations under the License. """Convert XXX checkpoint.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import argparse +import logging + import torch from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx -import logging logging.basicConfig(level=logging.INFO) diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py index a4477704a..3e8f51bfd 100644 --- a/templates/adding_a_new_model/modeling_tf_xxx.py +++ b/templates/adding_a_new_model/modeling_tf_xxx.py @@ -21,21 +21,22 @@ from __future__ import absolute_import, division, print_function, unicode_literals +import copy +import itertools import json import logging import math import os import sys -import copy -import itertools from io import open import numpy as np import tensorflow as tf from .configuration_xxx import XxxConfig -from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list from .file_utils import add_start_docstrings +from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list + logger = logging.getLogger(__name__) diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py index 7270376ec..4ea3cca8c 100644 --- a/templates/adding_a_new_model/modeling_xxx.py +++ b/templates/adding_a_new_model/modeling_xxx.py @@ -20,22 +20,23 @@ from __future__ import absolute_import, division, print_function, unicode_literals +import copy +import itertools import json import logging import math import os import sys -import copy -import itertools from io import open import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_utils import PreTrainedModel, prune_linear_layer from .configuration_xxx import XxxConfig from .file_utils import add_start_docstrings +from .modeling_utils import PreTrainedModel, prune_linear_layer + logger = logging.getLogger(__name__) diff --git a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py index 1e4f64042..b427df639 100644 --- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py +++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py @@ -12,18 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import unittest import sys +import unittest + +from transformers import XxxConfig, is_tf_available -from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .utils import CACHE_DIR, require_tf, slow -from transformers import XxxConfig, is_tf_available if is_tf_available(): import tensorflow as tf diff --git a/templates/adding_a_new_model/tests/modeling_xxx_test.py b/templates/adding_a_new_model/tests/modeling_xxx_test.py index 2043d7965..4191922eb 100644 --- a/templates/adding_a_new_model/tests/modeling_xxx_test.py +++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py @@ -12,18 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import unittest from transformers import is_torch_available -from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester +from .modeling_common_test import CommonTestCases, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device + if is_torch_available(): from transformers import ( XxxConfig, diff --git a/templates/adding_a_new_model/tests/tokenization_xxx_test.py b/templates/adding_a_new_model/tests/tokenization_xxx_test.py index 940de5c76..087c1002d 100644 --- a/templates/adding_a_new_model/tests/tokenization_xxx_test.py +++ b/templates/adding_a_new_model/tests/tokenization_xxx_test.py @@ -18,7 +18,7 @@ import os import unittest from io import open -from transformers.tokenization_bert import XxxTokenizer, VOCAB_FILES_NAMES +from transformers.tokenization_bert import VOCAB_FILES_NAMES, XxxTokenizer from .tokenization_tests_commons import CommonTestCases diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py index c1ea93a6d..30e3ce567 100644 --- a/templates/adding_a_new_model/tokenization_xxx.py +++ b/templates/adding_a_new_model/tokenization_xxx.py @@ -24,6 +24,7 @@ from io import open from .tokenization_utils import PreTrainedTokenizer + logger = logging.getLogger(__name__) #################################################### diff --git a/transformers/__init__.py b/transformers/__init__.py index 318cd5ce4..8e52771f2 100755 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -15,86 +15,114 @@ except: import logging -logger = logging.getLogger(__name__) # pylint: disable=invalid-name - -# Files and general utilities -from .file_utils import ( - TRANSFORMERS_CACHE, - PYTORCH_TRANSFORMERS_CACHE, - PYTORCH_PRETRAINED_BERT_CACHE, - cached_path, - add_start_docstrings, - add_end_docstrings, - WEIGHTS_NAME, - TF2_WEIGHTS_NAME, - TF_WEIGHTS_NAME, - CONFIG_NAME, - MODEL_CARD_NAME, - is_tf_available, - is_torch_available, -) +from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig +from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig +from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig +from .configuration_camembert import 
CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig +from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig +from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig +from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config +from .configuration_mmbt import MMBTConfig +from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig +from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig +from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config +from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig +# Configurations +from .configuration_utils import PretrainedConfig +from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig +from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig +from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig from .data import ( - is_sklearn_available, + DataProcessor, InputExample, InputFeatures, - DataProcessor, SingleSentenceClassificationProcessor, - glue_output_modes, + SquadExample, + SquadFeatures, + SquadV1Processor, + SquadV2Processor, glue_convert_examples_to_features, + glue_output_modes, glue_processors, glue_tasks_num_labels, + is_sklearn_available, + squad_convert_examples_to_features, xnli_output_modes, xnli_processors, xnli_tasks_num_labels, - squad_convert_examples_to_features, - SquadFeatures, - SquadExample, - SquadV1Processor, - SquadV2Processor, ) -if is_sklearn_available(): - from .data import glue_compute_metrics, xnli_compute_metrics +# Files and general utilities +from .file_utils import ( + CONFIG_NAME, + MODEL_CARD_NAME, + PYTORCH_PRETRAINED_BERT_CACHE, + PYTORCH_TRANSFORMERS_CACHE, + TF2_WEIGHTS_NAME, + TF_WEIGHTS_NAME, + TRANSFORMERS_CACHE, + WEIGHTS_NAME, + add_end_docstrings, + add_start_docstrings, + cached_path, + is_tf_available, + is_torch_available, +) # Model Cards from .modelcard import ModelCard -# Tokenizers -from .tokenization_utils import PreTrainedTokenizer +# TF 2.0 <=> PyTorch conversion utilities +from .modeling_tf_pytorch_utils import ( + convert_tf_weight_name_to_pt_weight_name, + load_pytorch_checkpoint_in_tf2_model, + load_pytorch_model_in_tf2_model, + load_pytorch_weights_in_tf2_model, + load_tf2_checkpoint_in_pytorch_model, + load_tf2_model_in_pytorch_model, + load_tf2_weights_in_pytorch_model, +) + +# Pipelines +from .pipelines import ( + CsvPipelineDataFormat, + FeatureExtractionPipeline, + JsonPipelineDataFormat, + NerPipeline, + PipedPipelineDataFormat, + Pipeline, + PipelineDataFormat, + QuestionAnsweringPipeline, + TextClassificationPipeline, + pipeline, +) +from .tokenization_albert import AlbertTokenizer from .tokenization_auto import AutoTokenizer -from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer -from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer -from .tokenization_openai import OpenAIGPTTokenizer -from .tokenization_transfo_xl import TransfoXLTokenizer, TransfoXLCorpus -from .tokenization_gpt2 import GPT2Tokenizer +from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer +from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer +from .tokenization_camembert import CamembertTokenizer from .tokenization_ctrl import CTRLTokenizer -from .tokenization_xlnet import 
XLNetTokenizer, SPIECE_UNDERLINE -from .tokenization_xlm import XLMTokenizer -from .tokenization_roberta import RobertaTokenizer from .tokenization_distilbert import DistilBertTokenizer -from .tokenization_albert import AlbertTokenizer -from .tokenization_camembert import CamembertTokenizer +from .tokenization_gpt2 import GPT2Tokenizer +from .tokenization_openai import OpenAIGPTTokenizer +from .tokenization_roberta import RobertaTokenizer from .tokenization_t5 import T5Tokenizer +from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer + +# Tokenizers +from .tokenization_utils import PreTrainedTokenizer +from .tokenization_xlm import XLMTokenizer from .tokenization_xlm_roberta import XLMRobertaTokenizer +from .tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer + + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + + +if is_sklearn_available(): + from .data import glue_compute_metrics, xnli_compute_metrics -# Configurations -from .configuration_utils import PretrainedConfig -from .configuration_auto import AutoConfig, ALL_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_mmbt import MMBTConfig # Modeling if is_torch_available(): @@ -345,30 +373,6 @@ if is_tf_available(): # Optimization from .optimization_tf import WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator -# TF 2.0 <=> PyTorch conversion utilities -from .modeling_tf_pytorch_utils import ( - convert_tf_weight_name_to_pt_weight_name, - load_pytorch_checkpoint_in_tf2_model, - load_pytorch_weights_in_tf2_model, - load_pytorch_model_in_tf2_model, - load_tf2_checkpoint_in_pytorch_model, - load_tf2_weights_in_pytorch_model, - load_tf2_model_in_pytorch_model, -) - -# Pipelines -from .pipelines import ( - pipeline, - PipelineDataFormat, - CsvPipelineDataFormat, - JsonPipelineDataFormat, - PipedPipelineDataFormat, - Pipeline, - FeatureExtractionPipeline, - QuestionAnsweringPipeline, - NerPipeline, - TextClassificationPipeline, -) if not is_tf_available() and not is_torch_available(): logger.warning( diff --git a/transformers/commands/convert.py b/transformers/commands/convert.py index e358d8532..29b6b1076 100644 --- a/transformers/commands/convert.py +++ b/transformers/commands/convert.py @@ -1,5 +1,4 @@ from argparse import ArgumentParser, Namespace - from logging import getLogger from transformers import AutoModel, AutoTokenizer diff --git a/transformers/commands/run.py 
b/transformers/commands/run.py index 617226306..fdc88c55e 100644 --- a/transformers/commands/run.py +++ b/transformers/commands/run.py @@ -2,7 +2,7 @@ import logging from argparse import ArgumentParser from transformers.commands import BaseTransformersCLICommand -from transformers.pipelines import pipeline, Pipeline, PipelineDataFormat, SUPPORTED_TASKS +from transformers.pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline logger = logging.getLogger(__name__) # pylint: disable=invalid-name diff --git a/transformers/commands/serving.py b/transformers/commands/serving.py index f7729c0bf..5d48cc0b3 100644 --- a/transformers/commands/serving.py +++ b/transformers/commands/serving.py @@ -1,7 +1,11 @@ +import logging from argparse import ArgumentParser, Namespace -from typing import List, Optional, Union, Any +from typing import Any, List, Optional, Union + +from transformers import Pipeline +from transformers.commands import BaseTransformersCLICommand +from transformers.pipelines import SUPPORTED_TASKS, pipeline -import logging try: from uvicorn import run @@ -14,9 +18,6 @@ except (ImportError, AttributeError): Body = lambda *x, **y: None _serve_dependancies_installed = False -from transformers import Pipeline -from transformers.commands import BaseTransformersCLICommand -from transformers.pipelines import SUPPORTED_TASKS, pipeline logger = logging.getLogger("transformers-cli/serving") diff --git a/transformers/commands/train.py b/transformers/commands/train.py index e51be71c7..bf16a4f5e 100644 --- a/transformers/commands/train.py +++ b/transformers/commands/train.py @@ -2,13 +2,10 @@ import os from argparse import ArgumentParser, Namespace from logging import getLogger +from transformers import SingleSentenceClassificationProcessor as Processor +from transformers import TextClassificationPipeline, is_tf_available, is_torch_available from transformers.commands import BaseTransformersCLICommand -from transformers import ( - is_tf_available, - is_torch_available, - TextClassificationPipeline, - SingleSentenceClassificationProcessor as Processor, -) + if not is_tf_available() and not is_torch_available(): raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training") diff --git a/transformers/commands/user.py b/transformers/commands/user.py index d29867d7c..c6edda280 100644 --- a/transformers/commands/user.py +++ b/transformers/commands/user.py @@ -1,6 +1,6 @@ +import os from argparse import ArgumentParser from getpass import getpass -import os from transformers.commands import BaseTransformersCLICommand from transformers.hf_api import HfApi, HfFolder, HTTPError diff --git a/transformers/configuration_albert.py b/transformers/configuration_albert.py index dc2b74a29..1d6adfa7e 100644 --- a/transformers/configuration_albert.py +++ b/transformers/configuration_albert.py @@ -17,6 +17,7 @@ from .configuration_utils import PretrainedConfig + ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json", "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json", diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py index e4311fc28..2c1d3f9d7 100644 --- a/transformers/configuration_auto.py +++ b/transformers/configuration_auto.py @@ -18,19 +18,20 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging -from .configuration_bert import BertConfig, 
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig +from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig +from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig +from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig +from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig +from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config +from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig +from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig +from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config +from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig +from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig +from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig +from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig + logger = logging.getLogger(__name__) diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py index 7c5ee434a..21ab7e47c 100644 --- a/transformers/configuration_bert.py +++ b/transformers/configuration_bert.py @@ -24,6 +24,7 @@ from io import open from .configuration_utils import PretrainedConfig + logger = logging.getLogger(__name__) BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { diff --git a/transformers/configuration_camembert.py b/transformers/configuration_camembert.py index 9aa641aa5..12f7d591e 100644 --- a/transformers/configuration_camembert.py +++ b/transformers/configuration_camembert.py @@ -21,6 +21,7 @@ import logging from .configuration_roberta import RobertaConfig + logger = logging.getLogger(__name__) CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { diff --git a/transformers/configuration_ctrl.py b/transformers/configuration_ctrl.py index 2726727d4..9b9a99960 100644 --- a/transformers/configuration_ctrl.py +++ b/transformers/configuration_ctrl.py @@ -23,6 +23,7 @@ from io import open from .configuration_utils import PretrainedConfig + logger = logging.getLogger(__name__) CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/ctrl-config.json"} diff --git a/transformers/configuration_distilbert.py b/transformers/configuration_distilbert.py index 
120cbfb9f..8aae69ad0 100644 --- a/transformers/configuration_distilbert.py +++ b/transformers/configuration_distilbert.py @@ -15,13 +15,14 @@ """ DistilBERT model configuration """ from __future__ import absolute_import, division, print_function, unicode_literals -import sys import json import logging +import sys from io import open from .configuration_utils import PretrainedConfig + logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { diff --git a/transformers/configuration_gpt2.py b/transformers/configuration_gpt2.py index adc8842ed..1bccdf9c4 100644 --- a/transformers/configuration_gpt2.py +++ b/transformers/configuration_gpt2.py @@ -24,6 +24,7 @@ from io import open from .configuration_utils import PretrainedConfig + logger = logging.getLogger(__name__) GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { diff --git a/transformers/configuration_mmbt.py b/transformers/configuration_mmbt.py index 5dad2babe..3d85d4448 100644 --- a/transformers/configuration_mmbt.py +++ b/transformers/configuration_mmbt.py @@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging + logger = logging.getLogger(__name__) diff --git a/transformers/configuration_openai.py b/transformers/configuration_openai.py index 53929aab5..81b2c82c6 100644 --- a/transformers/configuration_openai.py +++ b/transformers/configuration_openai.py @@ -24,6 +24,7 @@ from io import open from .configuration_utils import PretrainedConfig + logger = logging.getLogger(__name__) OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { diff --git a/transformers/configuration_roberta.py b/transformers/configuration_roberta.py index 3b8ddd1c4..7b1074abd 100644 --- a/transformers/configuration_roberta.py +++ b/transformers/configuration_roberta.py @@ -21,6 +21,7 @@ import logging from .configuration_bert import BertConfig + logger = logging.getLogger(__name__) ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py index 4584015e2..686e1af4d 100644 --- a/transformers/configuration_t5.py +++ b/transformers/configuration_t5.py @@ -19,11 +19,13 @@ from __future__ import absolute_import, division, print_function, unicode_litera import json import logging import sys -import six from io import open +import six + from .configuration_utils import PretrainedConfig + logger = logging.getLogger(__name__) T5_PRETRAINED_CONFIG_ARCHIVE_MAP = { diff --git a/transformers/configuration_transfo_xl.py b/transformers/configuration_transfo_xl.py index a2a7c5c02..49e6ce303 100644 --- a/transformers/configuration_transfo_xl.py +++ b/transformers/configuration_transfo_xl.py @@ -24,6 +24,7 @@ from io import open from .configuration_utils import PretrainedConfig + logger = logging.getLogger(__name__) TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py index f29899175..696930bb5 100644 --- a/transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -23,7 +23,8 @@ import logging import os from io import open -from .file_utils import CONFIG_NAME, cached_path, is_remote_url, hf_bucket_url +from .file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_remote_url + logger = logging.getLogger(__name__) diff --git a/transformers/configuration_xlm.py b/transformers/configuration_xlm.py index a98024e9e..cadf350c9 100644 --- a/transformers/configuration_xlm.py +++ b/transformers/configuration_xlm.py @@ -22,6 +22,7 @@ from io import open from 
.configuration_utils import PretrainedConfig + logger = logging.getLogger(__name__) XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { diff --git a/transformers/configuration_xlm_roberta.py b/transformers/configuration_xlm_roberta.py index fcf5c571d..bbd275ffe 100644 --- a/transformers/configuration_xlm_roberta.py +++ b/transformers/configuration_xlm_roberta.py @@ -21,6 +21,7 @@ import logging from .configuration_roberta import RobertaConfig + logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { diff --git a/transformers/configuration_xlnet.py b/transformers/configuration_xlnet.py index 8768aeac9..5af883e8e 100644 --- a/transformers/configuration_xlnet.py +++ b/transformers/configuration_xlnet.py @@ -23,6 +23,7 @@ from io import open from .configuration_utils import PretrainedConfig + logger = logging.getLogger(__name__) XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { diff --git a/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py b/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py index 733f6fc5c..bba3269a9 100644 --- a/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py @@ -14,16 +14,15 @@ # limitations under the License. """Convert ALBERT checkpoint.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import argparse +import logging + import torch from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert -import logging logging.basicConfig(level=logging.INFO) diff --git a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py index 9393068b1..87608f482 100755 --- a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py @@ -14,16 +14,15 @@ # limitations under the License. 
"""Convert BERT checkpoint.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import argparse +import logging + import torch from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert -import logging logging.basicConfig(level=logging.INFO) diff --git a/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py index 304c63450..c451521a4 100644 --- a/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py +++ b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py @@ -15,11 +15,13 @@ """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" -import os import argparse -import torch +import os + import numpy as np import tensorflow as tf +import torch + from transformers import BertModel diff --git a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py index eeafdb81e..3aa895725 100755 --- a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py @@ -17,13 +17,13 @@ from __future__ import absolute_import, division, print_function import argparse +import logging from io import open import torch from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2 -import logging logging.basicConfig(level=logging.INFO) diff --git a/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py index c87bb9d59..25c2a0a00 100755 --- a/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py @@ -17,13 +17,13 @@ from __future__ import absolute_import, division, print_function import argparse +import logging from io import open import torch from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt -import logging logging.basicConfig(level=logging.INFO) diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py index c7ad66e13..ba1dec53b 100644 --- a/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -14,58 +14,59 @@ # limitations under the License. 
""" Convert pytorch checkpoints to TensorFlow """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import os import argparse -import tensorflow as tf +import logging +import os -from transformers import is_torch_available, cached_path +import tensorflow as tf from transformers import ( - load_pytorch_checkpoint_in_tf2_model, - BertConfig, - TFBertForPreTraining, - TFBertForQuestionAnswering, - TFBertForSequenceClassification, + ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - GPT2Config, - TFGPT2LMHeadModel, + CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, + DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, - XLNetConfig, - TFXLNetLMHeadModel, - XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, - XLMConfig, - TFXLMWithLMHeadModel, - XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, - TransfoXLConfig, - TFTransfoXLLMHeadModel, - TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, - OpenAIGPTConfig, - TFOpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, - RobertaConfig, - TFRobertaForMaskedLM, - TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, + T5_PRETRAINED_CONFIG_ARCHIVE_MAP, + TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, + XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, + XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, + AlbertConfig, + BertConfig, + CTRLConfig, DistilBertConfig, + GPT2Config, + OpenAIGPTConfig, + RobertaConfig, + T5Config, + TFAlbertForMaskedLM, + TFBertForPreTraining, + TFBertForQuestionAnswering, + TFBertForSequenceClassification, + TFCTRLLMHeadModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, TFDistilBertForSequenceClassification, - DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - CTRLConfig, - TFCTRLLMHeadModel, - CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, - AlbertConfig, - TFAlbertForMaskedLM, - ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - T5Config, + TFGPT2LMHeadModel, + TFOpenAIGPTLMHeadModel, + TFRobertaForMaskedLM, + TFRobertaForSequenceClassification, TFT5WithLMHeadModel, - T5_PRETRAINED_CONFIG_ARCHIVE_MAP, + TFTransfoXLLMHeadModel, + TFXLMWithLMHeadModel, + TFXLNetLMHeadModel, + TransfoXLConfig, + XLMConfig, + XLNetConfig, + cached_path, + is_torch_available, + load_pytorch_checkpoint_in_tf2_model, ) + if is_torch_available(): import torch import numpy as np @@ -158,8 +159,6 @@ else: ) -import logging - logging.basicConfig(level=logging.INFO) MODEL_CLASSES = { diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py index 3dec4882f..5cf766b81 100644 --- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -18,16 +18,13 @@ from __future__ import absolute_import, division, print_function import argparse import logging -import numpy as np -import torch import pathlib -import fairseq +import numpy as np +import torch from packaging import version -if version.parse(fairseq.__version__) < version.parse("0.9.0"): - raise Exception("requires fairseq >= 0.9.0") - +import fairseq from fairseq.models.roberta import RobertaModel as FairseqRobertaModel from fairseq.modules import TransformerSentenceEncoderLayer from transformers.modeling_bert import ( @@ -47,6 +44,11 @@ from transformers.modeling_roberta import ( RobertaModel, ) + +if version.parse(fairseq.__version__) < version.parse("0.9.0"): + raise Exception("requires fairseq 
>= 0.9.0") + + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py index 0b22a5f9c..853c9b717 100755 --- a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py @@ -14,16 +14,15 @@ # limitations under the License. """Convert T5 checkpoint.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import argparse +import logging + import torch from transformers import T5Config, T5Model, load_tf_weights_in_t5 -import logging logging.basicConfig(level=logging.INFO) diff --git a/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py b/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py index f8dd45ae5..dc3241706 100755 --- a/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py @@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function import argparse +import logging import os import sys from io import open @@ -24,17 +25,21 @@ from io import open import torch import transformers.tokenization_transfo_xl as data_utils - -from transformers import CONFIG_NAME, WEIGHTS_NAME -from transformers import TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl +from transformers import ( + CONFIG_NAME, + WEIGHTS_NAME, + TransfoXLConfig, + TransfoXLLMHeadModel, + load_tf_weights_in_transfo_xl, +) from transformers.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES + if sys.version_info[0] == 2: import cPickle as pickle else: import pickle -import logging logging.basicConfig(level=logging.INFO) diff --git a/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py index 7cbf9cae9..ef98b76ab 100755 --- a/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py @@ -18,15 +18,15 @@ from __future__ import absolute_import, division, print_function import argparse import json +import logging from io import open -import torch import numpy +import torch from transformers import CONFIG_NAME, WEIGHTS_NAME from transformers.tokenization_xlm import VOCAB_FILES_NAMES -import logging logging.basicConfig(level=logging.INFO) diff --git a/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py index 83688cf07..37e93b7a1 100755 --- a/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py @@ -14,24 +14,25 @@ # limitations under the License. 
"""Convert BERT checkpoint.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import os import argparse +import logging +import os + import torch from transformers import ( CONFIG_NAME, WEIGHTS_NAME, XLNetConfig, - XLNetLMHeadModel, XLNetForQuestionAnswering, XLNetForSequenceClassification, + XLNetLMHeadModel, load_tf_weights_in_xlnet, ) + GLUE_TASKS_NUM_LABELS = { "cola": 2, "mnli": 3, @@ -44,7 +45,6 @@ GLUE_TASKS_NUM_LABELS = { "wnli": 2, } -import logging logging.basicConfig(level=logging.INFO) diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py index bac6c6e3a..c0a3cbf4c 100644 --- a/transformers/data/__init__.py +++ b/transformers/data/__init__.py @@ -1,15 +1,23 @@ +from .metrics import is_sklearn_available from .processors import ( + DataProcessor, InputExample, InputFeatures, - DataProcessor, - SquadFeatures, SingleSentenceClassificationProcessor, + SquadExample, + SquadFeatures, + SquadV1Processor, + SquadV2Processor, + glue_convert_examples_to_features, + glue_output_modes, + glue_processors, + glue_tasks_num_labels, + squad_convert_examples_to_features, + xnli_output_modes, + xnli_processors, + xnli_tasks_num_labels, ) -from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features -from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor -from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels -from .metrics import is_sklearn_available if is_sklearn_available(): from .metrics import glue_compute_metrics, xnli_compute_metrics diff --git a/transformers/data/metrics/__init__.py b/transformers/data/metrics/__init__.py index bd3b76efc..5264d501a 100644 --- a/transformers/data/metrics/__init__.py +++ b/transformers/data/metrics/__init__.py @@ -15,8 +15,9 @@ # limitations under the License. import csv -import sys import logging +import sys + logger = logging.getLogger(__name__) diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index a867fe3fd..0009a2e70 100644 --- a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -8,17 +8,19 @@ that a question is unanswerable. 
""" +import collections import json import logging import math -import collections +import re +import string from io import open + from tqdm import tqdm -import string -import re from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize + logger = logging.getLogger(__name__) diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py index e59e9fbcb..dee7f438a 100644 --- a/transformers/data/processors/__init__.py +++ b/transformers/data/processors/__init__.py @@ -1,4 +1,4 @@ -from .utils import InputExample, InputFeatures, DataProcessor, SingleSentenceClassificationProcessor -from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features -from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor +from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels +from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features +from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels diff --git a/transformers/data/processors/glue.py b/transformers/data/processors/glue.py index f9c0132a7..e88773ac9 100644 --- a/transformers/data/processors/glue.py +++ b/transformers/data/processors/glue.py @@ -18,8 +18,9 @@ import logging import os -from .utils import DataProcessor, InputExample, InputFeatures from ...file_utils import is_tf_available +from .utils import DataProcessor, InputExample, InputFeatures + if is_tf_available(): import tensorflow as tf diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index efb10830b..d47211a0d 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -1,16 +1,17 @@ -from tqdm import tqdm import collections +import json import logging import os -import json -import numpy as np -from multiprocessing import Pool -from multiprocessing import cpu_count from functools import partial +from multiprocessing import Pool, cpu_count + +import numpy as np +from tqdm import tqdm +from ...file_utils import is_tf_available, is_torch_available from ...tokenization_bert import BasicTokenizer, whitespace_tokenize from .utils import DataProcessor, InputExample, InputFeatures -from ...file_utils import is_tf_available, is_torch_available + if is_torch_available(): import torch diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py index 41cc00d4b..7e044438a 100644 --- a/transformers/data/processors/utils.py +++ b/transformers/data/processors/utils.py @@ -14,14 +14,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import csv -import sys import copy +import csv import json import logging +import sys from ...file_utils import is_tf_available, is_torch_available + logger = logging.getLogger(__name__) diff --git a/transformers/data/processors/xnli.py b/transformers/data/processors/xnli.py index ffe0358c1..d67a53062 100644 --- a/transformers/data/processors/xnli.py +++ b/transformers/data/processors/xnli.py @@ -22,6 +22,7 @@ import os from .utils import DataProcessor, InputExample + logger = logging.getLogger(__name__) diff --git a/transformers/file_utils.py b/transformers/file_utils.py index c45bdee04..b1a4d240d 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -5,26 +5,27 @@ Copyright by the AllenNLP authors. """ from __future__ import absolute_import, division, print_function, unicode_literals -import sys +import fnmatch import json import logging import os -import six +import sys import tempfile -import fnmatch +from contextlib import contextmanager from functools import partial, wraps from hashlib import sha256 from io import open import boto3 +import requests +import six from botocore.config import Config from botocore.exceptions import ClientError -import requests +from filelock import FileLock from tqdm.auto import tqdm -from contextlib import contextmanager + from . import __version__ -from filelock import FileLock logger = logging.getLogger(__name__) # pylint: disable=invalid-name diff --git a/transformers/hf_api.py b/transformers/hf_api.py index 81cc9f7eb..f92c10df5 100644 --- a/transformers/hf_api.py +++ b/transformers/hf_api.py @@ -22,6 +22,7 @@ import six from requests.exceptions import HTTPError from tqdm import tqdm + ENDPOINT = "https://huggingface.co" diff --git a/transformers/modelcard.py b/transformers/modelcard.py index e6b1982e9..bd218f0c4 100644 --- a/transformers/modelcard.py +++ b/transformers/modelcard.py @@ -23,15 +23,14 @@ import os from io import open from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP - from .file_utils import ( CONFIG_NAME, MODEL_CARD_NAME, - WEIGHTS_NAME, TF2_WEIGHTS_NAME, + WEIGHTS_NAME, cached_path, - is_remote_url, hf_bucket_url, + is_remote_url, ) diff --git a/transformers/modeling_albert.py b/transformers/modeling_albert.py index 3d55bcd64..7ff200084 100644 --- a/transformers/modeling_albert.py +++ b/transformers/modeling_albert.py @@ -14,17 +14,21 @@ # limitations under the License. """PyTorch ALBERT model. 
""" -import os -import math import logging +import math +import os + import torch import torch.nn as nn from torch.nn import CrossEntropyLoss, MSELoss -from transformers.modeling_utils import PreTrainedModel + from transformers.configuration_albert import AlbertConfig -from transformers.modeling_bert import BertEmbeddings, BertSelfAttention, prune_linear_layer, ACT2FN +from transformers.modeling_bert import ACT2FN, BertEmbeddings, BertSelfAttention, prune_linear_layer +from transformers.modeling_utils import PreTrainedModel + from .file_utils import add_start_docstrings + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index 31e9ee6bd..bcdde45bd 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -29,80 +29,78 @@ from .configuration_auto import ( RobertaConfig, TransfoXLConfig, XLMConfig, - XLNetConfig, XLMRobertaConfig, + XLNetConfig, +) +from .file_utils import add_start_docstrings +from .modeling_albert import ( + ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + AlbertForMaskedLM, + AlbertForQuestionAnswering, + AlbertForSequenceClassification, + AlbertModel, ) - from .modeling_bert import ( - BertModel, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForMaskedLM, - BertForSequenceClassification, BertForQuestionAnswering, + BertForSequenceClassification, BertForTokenClassification, - BERT_PRETRAINED_MODEL_ARCHIVE_MAP, -) -from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_ctrl import CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_xlnet import ( - XLNetModel, - XLNetLMHeadModel, - XLNetForSequenceClassification, - XLNetForQuestionAnswering, - XLNetForTokenClassification, - XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, -) -from .modeling_xlm import ( - XLMModel, - XLMWithLMHeadModel, - XLMForSequenceClassification, - XLMForQuestionAnswering, - XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + BertModel, ) -from .modeling_roberta import ( - RobertaModel, - RobertaForMaskedLM, - RobertaForSequenceClassification, - RobertaForTokenClassification, - ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, +from .modeling_camembert import ( + CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + CamembertForMaskedLM, + CamembertForMultipleChoice, + CamembertForSequenceClassification, + CamembertForTokenClassification, + CamembertModel, ) +from .modeling_ctrl import CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel, CTRLModel from .modeling_distilbert import ( - DistilBertModel, - DistilBertForQuestionAnswering, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForMaskedLM, + DistilBertForQuestionAnswering, DistilBertForSequenceClassification, DistilBertForTokenClassification, - DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + DistilBertModel, ) -from .modeling_camembert import ( - CamembertModel, - CamembertForMaskedLM, - CamembertForSequenceClassification, - CamembertForMultipleChoice, - CamembertForTokenClassification, - CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, +from .modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2LMHeadModel, GPT2Model +from .modeling_openai import OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTLMHeadModel, OpenAIGPTModel +from .modeling_roberta import ( + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + RobertaForMaskedLM, + 
RobertaForSequenceClassification, + RobertaForTokenClassification, + RobertaModel, ) -from .modeling_albert import ( - AlbertModel, - AlbertForMaskedLM, - AlbertForSequenceClassification, - AlbertForQuestionAnswering, - ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, +from .modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5Model, T5WithLMHeadModel +from .modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TransfoXLLMHeadModel, TransfoXLModel +from .modeling_utils import PreTrainedModel, SequenceSummary +from .modeling_xlm import ( + XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + XLMForQuestionAnswering, + XLMForSequenceClassification, + XLMModel, + XLMWithLMHeadModel, ) -from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_xlm_roberta import ( - XLMRobertaModel, + XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, XLMRobertaForMaskedLM, - XLMRobertaForSequenceClassification, XLMRobertaForMultipleChoice, + XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification, - XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + XLMRobertaModel, +) +from .modeling_xlnet import ( + XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + XLNetForQuestionAnswering, + XLNetForSequenceClassification, + XLNetForTokenClassification, + XLNetLMHeadModel, + XLNetModel, ) -from .modeling_utils import PreTrainedModel, SequenceSummary - -from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index 0994e832d..9c6cccf71 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -26,9 +26,10 @@ import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_utils import PreTrainedModel, prune_linear_layer from .configuration_bert import BertConfig from .file_utils import add_start_docstrings +from .modeling_utils import PreTrainedModel, prune_linear_layer + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_camembert.py b/transformers/modeling_camembert.py index 2a7a7a733..363399ee5 100644 --- a/transformers/modeling_camembert.py +++ b/transformers/modeling_camembert.py @@ -19,15 +19,16 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging +from .configuration_camembert import CamembertConfig +from .file_utils import add_start_docstrings from .modeling_roberta import ( - RobertaModel, RobertaForMaskedLM, - RobertaForSequenceClassification, RobertaForMultipleChoice, + RobertaForSequenceClassification, RobertaForTokenClassification, + RobertaModel, ) -from .configuration_camembert import CamembertConfig -from .file_utils import add_start_docstrings + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_ctrl.py b/transformers/modeling_ctrl.py index 37c15cf54..f34189170 100644 --- a/transformers/modeling_ctrl.py +++ b/transformers/modeling_ctrl.py @@ -24,15 +24,17 @@ import math import os import sys from io import open + import numpy as np import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from torch.nn.parameter import Parameter -from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary from .configuration_ctrl import CTRLConfig from .file_utils import add_start_docstrings +from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py index 
7345c2365..5fef44384 100644 --- a/transformers/modeling_distilbert.py +++ b/transformers/modeling_distilbert.py @@ -18,25 +18,23 @@ """ from __future__ import absolute_import, division, print_function, unicode_literals +import copy +import itertools import json import logging import math -import copy import sys from io import open -import itertools import numpy as np - import torch import torch.nn as nn from torch.nn import CrossEntropyLoss -from .modeling_utils import PreTrainedModel, prune_linear_layer from .configuration_distilbert import DistilBertConfig from .file_utils import add_start_docstrings +from .modeling_utils import PreTrainedModel, prune_linear_layer -import logging logger = logging.getLogger(__name__) diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py index e5bad422c..dfdcc418d 100644 --- a/transformers/modeling_encoder_decoder.py +++ b/transformers/modeling_encoder_decoder.py @@ -26,6 +26,7 @@ from tqdm import trange from .modeling_auto import AutoModel, AutoModelWithLMHead + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py index fe8a973f0..94e977e62 100644 --- a/transformers/modeling_gpt2.py +++ b/transformers/modeling_gpt2.py @@ -30,9 +30,10 @@ import torch.nn as nn from torch.nn import CrossEntropyLoss from torch.nn.parameter import Parameter -from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary from .configuration_gpt2 import GPT2Config from .file_utils import add_start_docstrings +from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_mmbt.py b/transformers/modeling_mmbt.py index 1c173ac69..2c22a409b 100644 --- a/transformers/modeling_mmbt.py +++ b/transformers/modeling_mmbt.py @@ -25,6 +25,7 @@ from torch.nn import CrossEntropyLoss, MSELoss from .file_utils import add_start_docstrings + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_openai.py b/transformers/modeling_openai.py index ed746ecac..3f37a4acf 100644 --- a/transformers/modeling_openai.py +++ b/transformers/modeling_openai.py @@ -30,9 +30,10 @@ import torch.nn as nn from torch.nn import CrossEntropyLoss from torch.nn.parameter import Parameter -from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary from .configuration_openai import OpenAIGPTConfig from .file_utils import add_start_docstrings +from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index 730058ea9..f6233061a 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -23,9 +23,10 @@ import torch import torch.nn as nn from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu from .configuration_roberta import RobertaConfig from .file_utils import add_start_docstrings +from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index 2ee8cd011..1467a0cd2 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -16,23 +16,24 @@ from __future__ import absolute_import, division, print_function, unicode_literals 
+import copy +import itertools import json import logging import math import os import sys -import copy -import itertools from io import open import torch -from torch import nn import torch.nn.functional as F +from torch import nn from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_utils import PreTrainedModel, prune_linear_layer from .configuration_t5 import T5Config -from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK +from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings +from .modeling_utils import PreTrainedModel, prune_linear_layer + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_albert.py b/transformers/modeling_tf_albert.py index 25d086398..de6ef405c 100644 --- a/transformers/modeling_tf_albert.py +++ b/transformers/modeling_tf_albert.py @@ -22,11 +22,10 @@ import sys import tensorflow as tf from .configuration_albert import AlbertConfig -from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list -from .modeling_tf_bert import ACT2FN, TFBertSelfAttention from .file_utils import add_start_docstrings +from .modeling_tf_bert import ACT2FN, TFBertSelfAttention +from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list -import logging logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py index 24a7338d4..9ce83fe4d 100644 --- a/transformers/modeling_tf_auto.py +++ b/transformers/modeling_tf_auto.py @@ -29,62 +29,61 @@ from .configuration_auto import ( XLMConfig, XLNetConfig, ) - +from .file_utils import add_start_docstrings +from .modeling_tf_albert import ( + TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + TFAlbertForMaskedLM, + TFAlbertForSequenceClassification, + TFAlbertModel, +) from .modeling_tf_bert import ( - TFBertModel, + TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, TFBertForMaskedLM, - TFBertForSequenceClassification, TFBertForQuestionAnswering, + TFBertForSequenceClassification, TFBertForTokenClassification, - TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, -) -from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_transfo_xl import ( - TFTransfoXLModel, - TFTransfoXLLMHeadModel, - TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, -) -from .modeling_tf_xlnet import ( - TFXLNetModel, - TFXLNetLMHeadModel, - TFXLNetForSequenceClassification, - TFXLNetForQuestionAnsweringSimple, - TFXLNetForTokenClassification, - TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + TFBertModel, ) -from .modeling_tf_xlm import ( - TFXLMModel, - TFXLMWithLMHeadModel, - TFXLMForSequenceClassification, - TFXLMForQuestionAnsweringSimple, - TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP, +from .modeling_tf_ctrl import TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, TFCTRLLMHeadModel, TFCTRLModel +from .modeling_tf_distilbert import ( + TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + TFDistilBertForMaskedLM, + TFDistilBertForQuestionAnswering, + TFDistilBertForSequenceClassification, + TFDistilBertForTokenClassification, + TFDistilBertModel, ) +from .modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, TFGPT2LMHeadModel, TFGPT2Model +from .modeling_tf_openai import TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, TFOpenAIGPTLMHeadModel, TFOpenAIGPTModel from .modeling_tf_roberta import ( - TFRobertaModel, + TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, 
TFRobertaForTokenClassification, - TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + TFRobertaModel, ) -from .modeling_tf_distilbert import ( - TFDistilBertModel, - TFDistilBertForQuestionAnswering, - TFDistilBertForMaskedLM, - TFDistilBertForSequenceClassification, - TFDistilBertForTokenClassification, - TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, +from .modeling_tf_t5 import TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP, TFT5Model, TFT5WithLMHeadModel +from .modeling_tf_transfo_xl import ( + TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + TFTransfoXLLMHeadModel, + TFTransfoXLModel, ) -from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_albert import ( - TFAlbertModel, - TFAlbertForMaskedLM, - TFAlbertForSequenceClassification, - TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, +from .modeling_tf_xlm import ( + TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + TFXLMForQuestionAnsweringSimple, + TFXLMForSequenceClassification, + TFXLMModel, + TFXLMWithLMHeadModel, +) +from .modeling_tf_xlnet import ( + TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + TFXLNetForQuestionAnsweringSimple, + TFXLNetForSequenceClassification, + TFXLNetForTokenClassification, + TFXLNetLMHeadModel, + TFXLNetModel, ) -from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP -from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py index bcb83d5df..4f919eab2 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -28,8 +28,9 @@ import numpy as np import tensorflow as tf from .configuration_bert import BertConfig -from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list from .file_utils import add_start_docstrings +from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_ctrl.py b/transformers/modeling_tf_ctrl.py index 3aba94a50..a3a22040d 100644 --- a/transformers/modeling_tf_ctrl.py +++ b/transformers/modeling_tf_ctrl.py @@ -21,12 +21,14 @@ import logging import os import sys from io import open + import numpy as np import tensorflow as tf from .configuration_ctrl import CTRLConfig -from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list, TFSharedEmbeddings from .file_utils import add_start_docstrings +from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, get_initializer, shape_list + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py index e9e89d2e7..98317488b 100644 --- a/transformers/modeling_tf_distilbert.py +++ b/transformers/modeling_tf_distilbert.py @@ -16,21 +16,21 @@ """ from __future__ import absolute_import, division, print_function, unicode_literals +import copy +import itertools import json import logging import math -import copy import sys from io import open -import itertools - import numpy as np import tensorflow as tf from .configuration_distilbert import DistilBertConfig -from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer from .file_utils import add_start_docstrings +from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, get_initializer, shape_list + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py index a4722fb34..bf551991b 100644 --- 
a/transformers/modeling_tf_gpt2.py +++ b/transformers/modeling_tf_gpt2.py @@ -28,16 +28,17 @@ from io import open import numpy as np import tensorflow as tf +from .configuration_gpt2 import GPT2Config +from .file_utils import add_start_docstrings from .modeling_tf_utils import ( - TFPreTrainedModel, TFConv1D, - TFSharedEmbeddings, + TFPreTrainedModel, TFSequenceSummary, - shape_list, + TFSharedEmbeddings, get_initializer, + shape_list, ) -from .configuration_gpt2 import GPT2Config -from .file_utils import add_start_docstrings + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py index 4720e3c5d..44924b4f4 100644 --- a/transformers/modeling_tf_openai.py +++ b/transformers/modeling_tf_openai.py @@ -28,16 +28,17 @@ from io import open import numpy as np import tensorflow as tf +from .configuration_openai import OpenAIGPTConfig +from .file_utils import add_start_docstrings from .modeling_tf_utils import ( - TFPreTrainedModel, TFConv1D, - TFSharedEmbeddings, + TFPreTrainedModel, TFSequenceSummary, - shape_list, + TFSharedEmbeddings, get_initializer, + shape_list, ) -from .configuration_openai import OpenAIGPTConfig -from .file_utils import add_start_docstrings + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py index d1073d23a..f05b8aa4a 100644 --- a/transformers/modeling_tf_pytorch_utils.py +++ b/transformers/modeling_tf_pytorch_utils.py @@ -20,8 +20,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging import os import re + import numpy + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py index 136ab6615..5c40682c3 100644 --- a/transformers/modeling_tf_roberta.py +++ b/transformers/modeling_tf_roberta.py @@ -22,10 +22,10 @@ import logging import tensorflow as tf from .configuration_roberta import RobertaConfig -from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list from .file_utils import add_start_docstrings - from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new +from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py index 38a2bf419..e995bc3c9 100644 --- a/transformers/modeling_tf_t5.py +++ b/transformers/modeling_tf_t5.py @@ -17,16 +17,17 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import logging -import math import copy import itertools +import logging +import math import tensorflow as tf from .configuration_t5 import T5Config +from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list -from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py index fc7ea932a..b6807d33d 100644 --- a/transformers/modeling_tf_transfo_xl.py +++ b/transformers/modeling_tf_transfo_xl.py @@ -18,11 +18,11 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import os +import collections import json -import math import logging -import collections +import math +import os import sys from io import open @@ -30,9 +30,10 @@ import numpy as np import 
tensorflow as tf from .configuration_transfo_xl import TransfoXLConfig -from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list, get_initializer -from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask from .file_utils import add_start_docstrings +from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask +from .modeling_tf_utils import TFConv1D, TFPreTrainedModel, TFSequenceSummary, get_initializer, shape_list + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_transfo_xl_utilities.py b/transformers/modeling_tf_transfo_xl_utilities.py index 0f2a4ebeb..33244eae8 100644 --- a/transformers/modeling_tf_transfo_xl_utilities.py +++ b/transformers/modeling_tf_transfo_xl_utilities.py @@ -19,7 +19,6 @@ from collections import defaultdict import numpy as np - import tensorflow as tf from .modeling_tf_utils import shape_list diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py index 7ecd79afd..637013b37 100644 --- a/transformers/modeling_tf_utils.py +++ b/transformers/modeling_tf_utils.py @@ -20,22 +20,23 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging import os +import h5py import tensorflow as tf from tensorflow.python.keras.saving import hdf5_format -import h5py from .configuration_utils import PretrainedConfig from .file_utils import ( + DUMMY_INPUTS, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, - DUMMY_INPUTS, cached_path, hf_bucket_url, is_remote_url, ) from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py index 2f443ae2f..a29a0b7fe 100644 --- a/transformers/modeling_tf_xlm.py +++ b/transformers/modeling_tf_xlm.py @@ -16,24 +16,25 @@ """ from __future__ import absolute_import, division, print_function, unicode_literals +import itertools import logging import math import os -import itertools import numpy as np import tensorflow as tf from .configuration_xlm import XLMConfig +from .file_utils import add_start_docstrings from .modeling_tf_utils import ( + DUMMY_INPUTS, TFPreTrainedModel, - TFSharedEmbeddings, TFSequenceSummary, - shape_list, + TFSharedEmbeddings, get_initializer, - DUMMY_INPUTS, + shape_list, ) -from .file_utils import add_start_docstrings + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py index c1ed720f9..e913a0513 100644 --- a/transformers/modeling_tf_xlnet.py +++ b/transformers/modeling_tf_xlnet.py @@ -28,8 +28,8 @@ import numpy as np import tensorflow as tf from .configuration_xlnet import XLNetConfig -from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer from .file_utils import add_start_docstrings +from .modeling_tf_utils import TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, get_initializer, shape_list logger = logging.getLogger(__name__) diff --git a/transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py index cee61ed37..a9040b53d 100644 --- a/transformers/modeling_transfo_xl.py +++ b/transformers/modeling_transfo_xl.py @@ -20,11 +20,11 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import os +import collections import json -import math import logging -import collections +import math +import os import sys from io import open @@ -34,10 +34,11 @@ import torch.nn.functional as 
F from torch.nn import CrossEntropyLoss from torch.nn.parameter import Parameter -from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary from .configuration_transfo_xl import TransfoXLConfig -from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits, LogUniformSampler from .file_utils import add_start_docstrings +from .modeling_transfo_xl_utilities import LogUniformSampler, ProjectedAdaptiveLogSoftmax, sample_logits +from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_transfo_xl_utilities.py b/transformers/modeling_transfo_xl_utilities.py index 89451bb55..c41954164 100644 --- a/transformers/modeling_transfo_xl_utilities.py +++ b/transformers/modeling_transfo_xl_utilities.py @@ -20,11 +20,11 @@ from collections import defaultdict import numpy as np - import torch import torch.nn as nn import torch.nn.functional as F + # CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) # CUDA_MINOR = int(torch.version.cuda.split('.')[1]) diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index e934b9052..245a1afa0 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -31,15 +31,16 @@ from torch.nn import functional as F from .configuration_utils import PretrainedConfig from .file_utils import ( + DUMMY_INPUTS, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, - DUMMY_INPUTS, cached_path, hf_bucket_url, is_remote_url, ) + logger = logging.getLogger(__name__) try: diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py index cd758a043..2127bbad3 100644 --- a/transformers/modeling_xlm.py +++ b/transformers/modeling_xlm.py @@ -16,20 +16,20 @@ """ from __future__ import absolute_import, division, print_function, unicode_literals +import itertools import logging import math -import itertools import numpy as np - import torch from torch import nn -from torch.nn import functional as F from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import functional as F -from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, SQuADHead from .configuration_xlm import XLMConfig from .file_utils import add_start_docstrings +from .modeling_utils import PreTrainedModel, SequenceSummary, SQuADHead, prune_linear_layer + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py index 8f1ed6ec6..adf7f2334 100644 --- a/transformers/modeling_xlm_roberta.py +++ b/transformers/modeling_xlm_roberta.py @@ -19,15 +19,16 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging +from .configuration_xlm_roberta import XLMRobertaConfig +from .file_utils import add_start_docstrings from .modeling_roberta import ( - RobertaModel, RobertaForMaskedLM, - RobertaForSequenceClassification, RobertaForMultipleChoice, + RobertaForSequenceClassification, RobertaForTokenClassification, + RobertaModel, ) -from .configuration_xlm_roberta import XLMRobertaConfig -from .file_utils import add_start_docstrings + logger = logging.getLogger(__name__) diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py index 2a210502d..423ba8cb7 100644 --- a/transformers/modeling_xlnet.py +++ b/transformers/modeling_xlnet.py @@ -26,19 +26,19 @@ from io import open import torch from torch import nn -from torch.nn import functional as F from torch.nn import CrossEntropyLoss, 
MSELoss +from torch.nn import functional as F +from .configuration_xlnet import XLNetConfig +from .file_utils import add_start_docstrings from .modeling_utils import ( - PreTrainedModel, - prune_linear_layer, - SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, + PreTrainedModel, + SequenceSummary, + prune_linear_layer, ) -from .configuration_xlnet import XLNetConfig -from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) diff --git a/transformers/optimization.py b/transformers/optimization.py index 0cd57078b..814a0c5ba 100644 --- a/transformers/optimization.py +++ b/transformers/optimization.py @@ -21,6 +21,7 @@ import torch from torch.optim import Optimizer from torch.optim.lr_scheduler import LambdaLR + logger = logging.getLogger(__name__) diff --git a/transformers/optimization_tf.py b/transformers/optimization_tf.py index bdcbd323c..c2c8a3180 100644 --- a/transformers/optimization_tf.py +++ b/transformers/optimization_tf.py @@ -14,9 +14,7 @@ # ============================================================================== """Functions and classes related to optimization (weight updates).""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import re diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 4149c2e47..257a891ae 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -14,36 +14,36 @@ # limitations under the License. from __future__ import absolute_import, division, print_function, unicode_literals -import sys import csv import json +import logging import os import pickle -import logging -import six - +import sys from abc import ABC, abstractmethod from contextlib import contextmanager from itertools import groupby from os.path import abspath, exists -from typing import Union, Optional, Tuple, List, Dict +from typing import Dict, List, Optional, Tuple, Union import numpy as np +import six from transformers import ( + ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig, AutoTokenizer, - PreTrainedTokenizer, - PretrainedConfig, + BasicTokenizer, ModelCard, + PretrainedConfig, + PreTrainedTokenizer, SquadExample, - squad_convert_examples_to_features, is_tf_available, is_torch_available, - BasicTokenizer, - ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, + squad_convert_examples_to_features, ) + if is_tf_available(): import tensorflow as tf from transformers import ( diff --git a/transformers/tests/configuration_common_test.py b/transformers/tests/configuration_common_test.py index d109a655f..65a4a35ae 100644 --- a/transformers/tests/configuration_common_test.py +++ b/transformers/tests/configuration_common_test.py @@ -12,15 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import os import json +import os import tempfile - import unittest + from .tokenization_tests_commons import TemporaryDirectory diff --git a/transformers/tests/hf_api_test.py b/transformers/tests/hf_api_test.py index 71963df10..0c86fab97 100644 --- a/transformers/tests/hf_api_test.py +++ b/transformers/tests/hf_api_test.py @@ -23,6 +23,7 @@ import six from transformers.hf_api import HfApi, HfFolder, HTTPError, PresignedUrl, S3Obj + USER = "__DUMMY_TRANSFORMERS_USER__" PASS = "__DUMMY_TRANSFORMERS_PASS__" FILES = [ diff --git a/transformers/tests/model_card_test.py b/transformers/tests/model_card_test.py index 30fe33a90..7a6f0721a 100644 --- a/transformers/tests/model_card_test.py +++ b/transformers/tests/model_card_test.py @@ -14,11 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function, unicode_literals -import os import json +import os import unittest from transformers.modelcard import ModelCard + from .tokenization_tests_commons import TemporaryDirectory diff --git a/transformers/tests/modeling_albert_test.py b/transformers/tests/modeling_albert_test.py index f798af95b..b2a0abe1f 100644 --- a/transformers/tests/modeling_albert_test.py +++ b/transformers/tests/modeling_albert_test.py @@ -12,18 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import unittest from transformers import is_torch_available -from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester +from .modeling_common_test import CommonTestCases, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device + if is_torch_available(): from transformers import ( AlbertConfig, diff --git a/transformers/tests/modeling_auto_test.py b/transformers/tests/modeling_auto_test.py index 3bdaa8a37..b2bb54ffa 100644 --- a/transformers/tests/modeling_auto_test.py +++ b/transformers/tests/modeling_auto_test.py @@ -12,17 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import unittest -import shutil import logging +import shutil +import unittest from transformers import is_torch_available -from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER +from .utils import SMALL_MODEL_IDENTIFIER, require_torch, slow + if is_torch_available(): from transformers import ( diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py index 6711aded6..f7325eff9 100644 --- a/transformers/tests/modeling_bert_test.py +++ b/transformers/tests/modeling_bert_test.py @@ -12,18 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import unittest from transformers import is_torch_available -from .modeling_common_test import CommonTestCases, ids_tensor, floats_tensor from .configuration_common_test import ConfigTester +from .modeling_common_test import CommonTestCases, floats_tensor, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device + if is_torch_available(): from transformers import ( BertConfig, diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index 6834c78d1..42de8c9ae 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -12,26 +12,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import copy -import sys +import json +import logging import os.path +import random import shutil +import sys import tempfile -import json -import random -import uuid - import unittest -import logging +import uuid from transformers import is_torch_available from .utils import CACHE_DIR, require_torch, slow, torch_device + if is_torch_available(): import torch import numpy as np diff --git a/transformers/tests/modeling_ctrl_test.py b/transformers/tests/modeling_ctrl_test.py index 9b71b1dd5..cdcd69104 100644 --- a/transformers/tests/modeling_ctrl_test.py +++ b/transformers/tests/modeling_ctrl_test.py @@ -11,23 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import unittest import pdb +import unittest from transformers import is_torch_available -if is_torch_available(): - from transformers import CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel - -from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester +from .modeling_common_test import CommonTestCases, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device +if is_torch_available(): + from transformers import CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel + + @require_torch class CTRLModelTest(CommonTestCases.CommonModelTester): diff --git a/transformers/tests/modeling_distilbert_test.py b/transformers/tests/modeling_distilbert_test.py index 5b4f4683d..eee84af1c 100644 --- a/transformers/tests/modeling_distilbert_test.py +++ b/transformers/tests/modeling_distilbert_test.py @@ -12,14 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import unittest from transformers import is_torch_available +from .configuration_common_test import ConfigTester +from .modeling_common_test import CommonTestCases, ids_tensor +from .utils import CACHE_DIR, require_torch, slow, torch_device + + if is_torch_available(): from transformers import ( DistilBertConfig, @@ -30,10 +33,6 @@ if is_torch_available(): DistilBertForSequenceClassification, ) -from .modeling_common_test import CommonTestCases, ids_tensor -from .configuration_common_test import ConfigTester -from .utils import CACHE_DIR, require_torch, slow, torch_device - @require_torch class DistilBertModelTest(CommonTestCases.CommonModelTester): diff --git a/transformers/tests/modeling_encoder_decoder_test.py b/transformers/tests/modeling_encoder_decoder_test.py index 491c502ba..b9cef6667 100644 --- a/transformers/tests/modeling_encoder_decoder_test.py +++ b/transformers/tests/modeling_encoder_decoder_test.py @@ -17,8 +17,10 @@ import logging import unittest from transformers import is_torch_available + from .utils import require_torch, slow + if is_torch_available(): from transformers import BertModel, BertForMaskedLM, Model2Model from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP diff --git a/transformers/tests/modeling_gpt2_test.py b/transformers/tests/modeling_gpt2_test.py index 2706166b3..82ace8529 100644 --- a/transformers/tests/modeling_gpt2_test.py +++ b/transformers/tests/modeling_gpt2_test.py @@ -12,14 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import unittest from transformers import is_torch_available +from .configuration_common_test import ConfigTester +from .modeling_common_test import CommonTestCases, ids_tensor +from .utils import CACHE_DIR, require_torch, slow, torch_device + + if is_torch_available(): from transformers import ( GPT2Config, @@ -29,10 +32,6 @@ if is_torch_available(): GPT2DoubleHeadsModel, ) -from .modeling_common_test import CommonTestCases, ids_tensor -from .configuration_common_test import ConfigTester -from .utils import CACHE_DIR, require_torch, slow, torch_device - @require_torch class GPT2ModelTest(CommonTestCases.CommonModelTester): diff --git a/transformers/tests/modeling_openai_test.py b/transformers/tests/modeling_openai_test.py index f22a0b760..21ea556ac 100644 --- a/transformers/tests/modeling_openai_test.py +++ b/transformers/tests/modeling_openai_test.py @@ -12,14 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import unittest from transformers import is_torch_available +from .configuration_common_test import ConfigTester +from .modeling_common_test import CommonTestCases, ids_tensor +from .utils import CACHE_DIR, require_torch, slow, torch_device + + if is_torch_available(): from transformers import ( OpenAIGPTConfig, @@ -29,10 +32,6 @@ if is_torch_available(): OpenAIGPTDoubleHeadsModel, ) -from .modeling_common_test import CommonTestCases, ids_tensor -from .configuration_common_test import ConfigTester -from .utils import CACHE_DIR, require_torch, slow, torch_device - @require_torch class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): diff --git a/transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py index 451dafe08..e6909deae 100644 --- a/transformers/tests/modeling_roberta_test.py +++ b/transformers/tests/modeling_roberta_test.py @@ -12,14 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import unittest from transformers import is_torch_available +from .configuration_common_test import ConfigTester +from .modeling_common_test import CommonTestCases, ids_tensor +from .utils import CACHE_DIR, require_torch, slow, torch_device + + if is_torch_available(): import torch from transformers import ( @@ -32,10 +35,6 @@ if is_torch_available(): from transformers.modeling_roberta import RobertaEmbeddings from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import CommonTestCases, ids_tensor -from .configuration_common_test import ConfigTester -from .utils import CACHE_DIR, require_torch, slow, torch_device - @require_torch class RobertaModelTest(CommonTestCases.CommonModelTester): diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py index 3feb61a62..2bf3bdae1 100644 --- a/transformers/tests/modeling_t5_test.py +++ b/transformers/tests/modeling_t5_test.py @@ -12,18 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import unittest from transformers import is_torch_available -from .modeling_common_test import CommonTestCases, ids_tensor, floats_tensor from .configuration_common_test import ConfigTester +from .modeling_common_test import CommonTestCases, floats_tensor, ids_tensor from .utils import CACHE_DIR, require_torch, slow, torch_device + if is_torch_available(): from transformers import T5Config, T5Model, T5WithLMHeadModel from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP diff --git a/transformers/tests/modeling_tf_albert_test.py b/transformers/tests/modeling_tf_albert_test.py index 0406592d5..344e999a0 100644 --- a/transformers/tests/modeling_tf_albert_test.py +++ b/transformers/tests/modeling_tf_albert_test.py @@ -12,18 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import unittest import sys +import unittest + +from transformers import AlbertConfig, is_tf_available -from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .utils import CACHE_DIR, require_tf, slow -from transformers import AlbertConfig, is_tf_available if is_tf_available(): import tensorflow as tf diff --git a/transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py index d695474ec..54581505e 100644 --- a/transformers/tests/modeling_tf_auto_test.py +++ b/transformers/tests/modeling_tf_auto_test.py @@ -12,17 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import unittest -import shutil import logging +import shutil +import unittest from transformers import is_tf_available -from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER +from .utils import SMALL_MODEL_IDENTIFIER, require_tf, slow + if is_tf_available(): from transformers import ( diff --git a/transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py index e36e3a2c3..735de447e 100644 --- a/transformers/tests/modeling_tf_bert_test.py +++ b/transformers/tests/modeling_tf_bert_test.py @@ -12,18 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import unittest import sys +import unittest + +from transformers import BertConfig, is_tf_available -from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .utils import CACHE_DIR, require_tf, slow -from transformers import BertConfig, is_tf_available if is_tf_available(): import tensorflow as tf diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py index d65e270ae..6f2d62cc9 100644 --- a/transformers/tests/modeling_tf_common_test.py +++ b/transformers/tests/modeling_tf_common_test.py @@ -14,23 +14,23 @@ # limitations under the License. from __future__ import absolute_import, division, print_function -import os import copy +import importlib import json import logging -import importlib +import os import random import shutil +import sys +import tempfile import unittest import uuid -import tempfile - -import sys from transformers import is_tf_available, is_torch_available from .utils import require_tf, slow + if is_tf_available(): import tensorflow as tf import numpy as np diff --git a/transformers/tests/modeling_tf_ctrl_test.py b/transformers/tests/modeling_tf_ctrl_test.py index fb8c4c255..895579eab 100644 --- a/transformers/tests/modeling_tf_ctrl_test.py +++ b/transformers/tests/modeling_tf_ctrl_test.py @@ -12,18 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import unittest import sys +import unittest + +from transformers import CTRLConfig, is_tf_available -from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .utils import CACHE_DIR, require_tf, slow -from transformers import CTRLConfig, is_tf_available if is_tf_available(): import tensorflow as tf diff --git a/transformers/tests/modeling_tf_distilbert_test.py b/transformers/tests/modeling_tf_distilbert_test.py index 3260f63d5..ebb17e298 100644 --- a/transformers/tests/modeling_tf_distilbert_test.py +++ b/transformers/tests/modeling_tf_distilbert_test.py @@ -12,17 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import unittest -from .modeling_tf_common_test import TFCommonTestCases, ids_tensor +from transformers import DistilBertConfig, is_tf_available + from .configuration_common_test import ConfigTester +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .utils import CACHE_DIR, require_tf, slow -from transformers import DistilBertConfig, is_tf_available if is_tf_available(): import tensorflow as tf diff --git a/transformers/tests/modeling_tf_gpt2_test.py b/transformers/tests/modeling_tf_gpt2_test.py index 09b7eb071..49bb10c43 100644 --- a/transformers/tests/modeling_tf_gpt2_test.py +++ b/transformers/tests/modeling_tf_gpt2_test.py @@ -12,18 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import unittest import sys +import unittest + +from transformers import GPT2Config, is_tf_available -from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .utils import CACHE_DIR, require_tf, slow -from transformers import GPT2Config, is_tf_available if is_tf_available(): import tensorflow as tf diff --git a/transformers/tests/modeling_tf_openai_gpt_test.py b/transformers/tests/modeling_tf_openai_gpt_test.py index a59395e02..0198527f5 100644 --- a/transformers/tests/modeling_tf_openai_gpt_test.py +++ b/transformers/tests/modeling_tf_openai_gpt_test.py @@ -12,18 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import unittest import sys +import unittest + +from transformers import OpenAIGPTConfig, is_tf_available -from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .utils import CACHE_DIR, require_tf, slow -from transformers import OpenAIGPTConfig, is_tf_available if is_tf_available(): import tensorflow as tf diff --git a/transformers/tests/modeling_tf_roberta_test.py b/transformers/tests/modeling_tf_roberta_test.py index 23ea55740..3b9f1961b 100644 --- a/transformers/tests/modeling_tf_roberta_test.py +++ b/transformers/tests/modeling_tf_roberta_test.py @@ -12,17 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import unittest -from .modeling_tf_common_test import TFCommonTestCases, ids_tensor +from transformers import RobertaConfig, is_tf_available + from .configuration_common_test import ConfigTester +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .utils import CACHE_DIR, require_tf, slow -from transformers import RobertaConfig, is_tf_available if is_tf_available(): import tensorflow as tf diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py index 521085219..2108b9007 100644 --- a/transformers/tests/modeling_tf_t5_test.py +++ b/transformers/tests/modeling_tf_t5_test.py @@ -12,18 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import unittest import sys +import unittest + +from transformers import T5Config, is_tf_available -from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .utils import CACHE_DIR, require_tf, slow -from transformers import T5Config, is_tf_available if is_tf_available(): import tensorflow as tf diff --git a/transformers/tests/modeling_tf_transfo_xl_test.py b/transformers/tests/modeling_tf_transfo_xl_test.py index 20de598d0..2b17668a9 100644 --- a/transformers/tests/modeling_tf_transfo_xl_test.py +++ b/transformers/tests/modeling_tf_transfo_xl_test.py @@ -12,18 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import unittest import random +import unittest + +from transformers import TransfoXLConfig, is_tf_available -from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .utils import CACHE_DIR, require_tf, slow -from transformers import TransfoXLConfig, is_tf_available if is_tf_available(): import tensorflow as tf diff --git a/transformers/tests/modeling_tf_xlm_test.py b/transformers/tests/modeling_tf_xlm_test.py index 9162bf2b3..0850cecb0 100644 --- a/transformers/tests/modeling_tf_xlm_test.py +++ b/transformers/tests/modeling_tf_xlm_test.py @@ -12,14 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import unittest from transformers import is_tf_available +from .configuration_common_test import ConfigTester +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor +from .utils import CACHE_DIR, require_tf, slow + + if is_tf_available(): import tensorflow as tf from transformers import ( @@ -31,10 +34,6 @@ if is_tf_available(): TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP, ) -from .modeling_tf_common_test import TFCommonTestCases, ids_tensor -from .configuration_common_test import ConfigTester -from .utils import CACHE_DIR, require_tf, slow - @require_tf class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): diff --git a/transformers/tests/modeling_tf_xlnet_test.py b/transformers/tests/modeling_tf_xlnet_test.py index 9a56384a0..67fc1a5ce 100644 --- a/transformers/tests/modeling_tf_xlnet_test.py +++ b/transformers/tests/modeling_tf_xlnet_test.py @@ -12,17 +12,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import os -import unittest import json +import os import random +import unittest from transformers import XLNetConfig, is_tf_available +from .configuration_common_test import ConfigTester +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor +from .utils import CACHE_DIR, require_tf, slow + + if is_tf_available(): import tensorflow as tf @@ -35,10 +38,6 @@ if is_tf_available(): TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, ) -from .modeling_tf_common_test import TFCommonTestCases, ids_tensor -from .configuration_common_test import ConfigTester -from .utils import CACHE_DIR, require_tf, slow - @require_tf class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): diff --git a/transformers/tests/modeling_transfo_xl_test.py b/transformers/tests/modeling_transfo_xl_test.py index f04205d4e..4289483a8 100644 --- a/transformers/tests/modeling_transfo_xl_test.py +++ b/transformers/tests/modeling_transfo_xl_test.py @@ -12,24 +12,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import unittest import random +import unittest from transformers import is_torch_available +from .configuration_common_test import ConfigTester +from .modeling_common_test import CommonTestCases, ids_tensor +from .utils import CACHE_DIR, require_torch, slow, torch_device + + if is_torch_available(): import torch from transformers import TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import CommonTestCases, ids_tensor -from .configuration_common_test import ConfigTester -from .utils import CACHE_DIR, require_torch, slow, torch_device - @require_torch class TransfoXLModelTest(CommonTestCases.CommonModelTester): diff --git a/transformers/tests/modeling_xlm_test.py b/transformers/tests/modeling_xlm_test.py index 843693fd0..a0cc8e69f 100644 --- a/transformers/tests/modeling_xlm_test.py +++ b/transformers/tests/modeling_xlm_test.py @@ -12,14 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import unittest from transformers import is_torch_available +from .configuration_common_test import ConfigTester +from .modeling_common_test import CommonTestCases, ids_tensor +from .utils import CACHE_DIR, require_torch, slow, torch_device + + if is_torch_available(): from transformers import ( XLMConfig, @@ -31,10 +34,6 @@ if is_torch_available(): ) from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import CommonTestCases, ids_tensor -from .configuration_common_test import ConfigTester -from .utils import CACHE_DIR, require_torch, slow, torch_device - @require_torch class XLMModelTest(CommonTestCases.CommonModelTester): diff --git a/transformers/tests/modeling_xlnet_test.py b/transformers/tests/modeling_xlnet_test.py index 487756a5c..ac0e542cc 100644 --- a/transformers/tests/modeling_xlnet_test.py +++ b/transformers/tests/modeling_xlnet_test.py @@ -12,17 +12,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import os -import unittest import json +import os import random +import unittest from transformers import is_torch_available +from .configuration_common_test import ConfigTester +from .modeling_common_test import CommonTestCases, ids_tensor +from .utils import CACHE_DIR, require_torch, slow, torch_device + + if is_torch_available(): import torch @@ -36,10 +39,6 @@ if is_torch_available(): ) from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import CommonTestCases, ids_tensor -from .configuration_common_test import ConfigTester -from .utils import CACHE_DIR, require_torch, slow, torch_device - @require_torch class XLNetModelTest(CommonTestCases.CommonModelTester): diff --git a/transformers/tests/optimization_test.py b/transformers/tests/optimization_test.py index 0addcde1d..c0cef1e38 100644 --- a/transformers/tests/optimization_test.py +++ b/transformers/tests/optimization_test.py @@ -12,15 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import unittest import os +import unittest from transformers import is_torch_available +from .tokenization_tests_commons import TemporaryDirectory +from .utils import require_torch + + if is_torch_available(): import torch @@ -33,9 +35,6 @@ if is_torch_available(): get_linear_schedule_with_warmup, ) -from .tokenization_tests_commons import TemporaryDirectory -from .utils import require_torch - def unwrap_schedule(scheduler, num_steps=10): lrs = [] diff --git a/transformers/tests/optimization_tf_test.py b/transformers/tests/optimization_tf_test.py index e88ee971e..4058aaf83 100644 --- a/transformers/tests/optimization_tf_test.py +++ b/transformers/tests/optimization_tf_test.py @@ -1,6 +1,4 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import unittest @@ -8,6 +6,7 @@ from transformers import is_tf_available from .utils import require_tf + if is_tf_available(): import tensorflow as tf from tensorflow.python.eager import context diff --git a/transformers/tests/pipelines_test.py b/transformers/tests/pipelines_test.py index 3c258594d..2dfbdaaa0 100644 --- a/transformers/tests/pipelines_test.py +++ b/transformers/tests/pipelines_test.py @@ -1,10 +1,10 @@ import unittest - from typing import Iterable from transformers import pipeline from transformers.tests.utils import require_tf, require_torch + QA_FINETUNED_MODELS = { ("bert-base-uncased", "bert-large-uncased-whole-word-masking-finetuned-squad", None), ("bert-base-cased", "bert-large-cased-whole-word-masking-finetuned-squad", None), diff --git a/transformers/tests/tokenization_albert_test.py b/transformers/tests/tokenization_albert_test.py index 7d7e793b5..867dd5591 100644 --- a/transformers/tests/tokenization_albert_test.py +++ b/transformers/tests/tokenization_albert_test.py @@ -17,10 +17,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest -from transformers.tokenization_albert 
import AlbertTokenizer, SPIECE_UNDERLINE +from transformers.tokenization_albert import SPIECE_UNDERLINE, AlbertTokenizer from .tokenization_tests_commons import CommonTestCases + SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model") diff --git a/transformers/tests/tokenization_auto_test.py b/transformers/tests/tokenization_auto_test.py index 7d77bf5b2..4ff2fa791 100644 --- a/transformers/tests/tokenization_auto_test.py +++ b/transformers/tests/tokenization_auto_test.py @@ -12,18 +12,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function -import unittest -import shutil import logging +import shutil +import unittest -from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer -from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP +from transformers import ( + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, + AutoTokenizer, + BertTokenizer, + GPT2Tokenizer, +) -from .utils import slow, SMALL_MODEL_IDENTIFIER +from .utils import SMALL_MODEL_IDENTIFIER, slow class AutoTokenizerTest(unittest.TestCase): diff --git a/transformers/tests/tokenization_bert_japanese_test.py b/transformers/tests/tokenization_bert_japanese_test.py index 02eb8c0a6..84119c081 100644 --- a/transformers/tests/tokenization_bert_japanese_test.py +++ b/transformers/tests/tokenization_bert_japanese_test.py @@ -20,14 +20,14 @@ from io import open from transformers.tokenization_bert import WordpieceTokenizer from transformers.tokenization_bert_japanese import ( + VOCAB_FILES_NAMES, BertJapaneseTokenizer, - MecabTokenizer, CharacterTokenizer, - VOCAB_FILES_NAMES, + MecabTokenizer, ) from .tokenization_tests_commons import CommonTestCases -from .utils import slow, custom_tokenizers +from .utils import custom_tokenizers, slow @custom_tokenizers diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py index bf023761a..9c8c18fe4 100644 --- a/transformers/tests/tokenization_bert_test.py +++ b/transformers/tests/tokenization_bert_test.py @@ -19,13 +19,13 @@ import unittest from io import open from transformers.tokenization_bert import ( + VOCAB_FILES_NAMES, BasicTokenizer, BertTokenizer, WordpieceTokenizer, _is_control, _is_punctuation, _is_whitespace, - VOCAB_FILES_NAMES, ) from .tokenization_tests_commons import CommonTestCases diff --git a/transformers/tests/tokenization_ctrl_test.py b/transformers/tests/tokenization_ctrl_test.py index 04c9dec52..eb3fbb9da 100644 --- a/transformers/tests/tokenization_ctrl_test.py +++ b/transformers/tests/tokenization_ctrl_test.py @@ -13,12 +13,12 @@ # limitations under the License. 
from __future__ import absolute_import, division, print_function, unicode_literals +import json import os import unittest -import json from io import open -from transformers.tokenization_ctrl import CTRLTokenizer, VOCAB_FILES_NAMES +from transformers.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer from .tokenization_tests_commons import CommonTestCases diff --git a/transformers/tests/tokenization_distilbert_test.py b/transformers/tests/tokenization_distilbert_test.py index 551f9e188..b7760e0eb 100644 --- a/transformers/tests/tokenization_distilbert_test.py +++ b/transformers/tests/tokenization_distilbert_test.py @@ -20,8 +20,8 @@ from io import open from transformers.tokenization_distilbert import DistilBertTokenizer -from .tokenization_tests_commons import CommonTestCases from .tokenization_bert_test import BertTokenizationTest +from .tokenization_tests_commons import CommonTestCases from .utils import slow diff --git a/transformers/tests/tokenization_gpt2_test.py b/transformers/tests/tokenization_gpt2_test.py index 552b73416..9246e5ce1 100644 --- a/transformers/tests/tokenization_gpt2_test.py +++ b/transformers/tests/tokenization_gpt2_test.py @@ -14,12 +14,12 @@ # limitations under the License. from __future__ import absolute_import, division, print_function, unicode_literals +import json import os import unittest -import json from io import open -from transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES +from transformers.tokenization_gpt2 import VOCAB_FILES_NAMES, GPT2Tokenizer from .tokenization_tests_commons import CommonTestCases diff --git a/transformers/tests/tokenization_openai_test.py b/transformers/tests/tokenization_openai_test.py index c6a802b7b..fe4ed77c1 100644 --- a/transformers/tests/tokenization_openai_test.py +++ b/transformers/tests/tokenization_openai_test.py @@ -14,11 +14,11 @@ # limitations under the License. from __future__ import absolute_import, division, print_function, unicode_literals +import json import os import unittest -import json -from transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES +from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer from .tokenization_tests_commons import CommonTestCases diff --git a/transformers/tests/tokenization_roberta_test.py b/transformers/tests/tokenization_roberta_test.py index a1d9d5fb7..92a1a6d5d 100644 --- a/transformers/tests/tokenization_roberta_test.py +++ b/transformers/tests/tokenization_roberta_test.py @@ -14,12 +14,13 @@ # limitations under the License. 
from __future__ import absolute_import, division, print_function, unicode_literals -import os import json +import os import unittest from io import open -from transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES +from transformers.tokenization_roberta import VOCAB_FILES_NAMES, RobertaTokenizer + from .tokenization_tests_commons import CommonTestCases from .utils import slow diff --git a/transformers/tests/tokenization_t5_test.py b/transformers/tests/tokenization_t5_test.py index 09bc0267f..69f209f29 100644 --- a/transformers/tests/tokenization_t5_test.py +++ b/transformers/tests/tokenization_t5_test.py @@ -22,6 +22,7 @@ from transformers.tokenization_xlnet import SPIECE_UNDERLINE from .tokenization_tests_commons import CommonTestCases + SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index ba8110108..79b4bf781 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -15,11 +15,12 @@ from __future__ import absolute_import, division, print_function, unicode_literals import os +import shutil import sys -from io import open import tempfile -import shutil import unittest +from io import open + if sys.version_info[0] == 2: import cPickle as pickle diff --git a/transformers/tests/tokenization_transfo_xl_test.py b/transformers/tests/tokenization_transfo_xl_test.py index 8b737283d..cb9d3d4de 100644 --- a/transformers/tests/tokenization_transfo_xl_test.py +++ b/transformers/tests/tokenization_transfo_xl_test.py @@ -20,13 +20,14 @@ from io import open from transformers import is_torch_available +from .tokenization_tests_commons import CommonTestCases +from .utils import require_torch + + if is_torch_available(): import torch from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES -from .tokenization_tests_commons import CommonTestCases -from .utils import require_torch - @require_torch class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester): diff --git a/transformers/tests/tokenization_utils_test.py b/transformers/tests/tokenization_utils_test.py index 4fa92c44b..76681b1af 100644 --- a/transformers/tests/tokenization_utils_test.py +++ b/transformers/tests/tokenization_utils_test.py @@ -12,11 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import unittest + import six from transformers import PreTrainedTokenizer diff --git a/transformers/tests/tokenization_xlm_test.py b/transformers/tests/tokenization_xlm_test.py index e9aa2b7d0..3ce501535 100644 --- a/transformers/tests/tokenization_xlm_test.py +++ b/transformers/tests/tokenization_xlm_test.py @@ -14,11 +14,11 @@ # limitations under the License. 
from __future__ import absolute_import, division, print_function, unicode_literals +import json import os import unittest -import json -from transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES +from transformers.tokenization_xlm import VOCAB_FILES_NAMES, XLMTokenizer from .tokenization_tests_commons import CommonTestCases from .utils import slow diff --git a/transformers/tests/tokenization_xlnet_test.py b/transformers/tests/tokenization_xlnet_test.py index 32482449a..2c55a337b 100644 --- a/transformers/tests/tokenization_xlnet_test.py +++ b/transformers/tests/tokenization_xlnet_test.py @@ -17,11 +17,12 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest -from transformers.tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE +from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer from .tokenization_tests_commons import CommonTestCases from .utils import slow + SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py index aab5e5a8a..66ff53d6e 100644 --- a/transformers/tests/utils.py +++ b/transformers/tests/utils.py @@ -1,7 +1,6 @@ import os -import unittest import tempfile - +import unittest from distutils.util import strtobool from transformers.file_utils import _tf_available, _torch_available diff --git a/transformers/tokenization_albert.py b/transformers/tokenization_albert.py index b03b3ca11..276a33cbf 100644 --- a/transformers/tokenization_albert.py +++ b/transformers/tokenization_albert.py @@ -15,13 +15,16 @@ """ Tokenization classes for ALBERT model.""" from __future__ import absolute_import, division, print_function, unicode_literals -from .tokenization_utils import PreTrainedTokenizer import logging -import unicodedata -import six import os +import unicodedata from shutil import copyfile +import six + +from .tokenization_utils import PreTrainedTokenizer + + logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py index 5d36fdcba..7077ec134 100644 --- a/transformers/tokenization_auto.py +++ b/transformers/tokenization_auto.py @@ -18,20 +18,21 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging +from .tokenization_albert import AlbertTokenizer from .tokenization_bert import BertTokenizer from .tokenization_bert_japanese import BertJapaneseTokenizer -from .tokenization_openai import OpenAIGPTTokenizer -from .tokenization_gpt2 import GPT2Tokenizer +from .tokenization_camembert import CamembertTokenizer from .tokenization_ctrl import CTRLTokenizer -from .tokenization_transfo_xl import TransfoXLTokenizer -from .tokenization_xlnet import XLNetTokenizer -from .tokenization_xlm import XLMTokenizer -from .tokenization_roberta import RobertaTokenizer from .tokenization_distilbert import DistilBertTokenizer -from .tokenization_camembert import CamembertTokenizer -from .tokenization_albert import AlbertTokenizer +from .tokenization_gpt2 import GPT2Tokenizer +from .tokenization_openai import OpenAIGPTTokenizer +from .tokenization_roberta import RobertaTokenizer from .tokenization_t5 import T5Tokenizer +from .tokenization_transfo_xl import TransfoXLTokenizer +from .tokenization_xlm import XLMTokenizer from .tokenization_xlm_roberta import XLMRobertaTokenizer +from .tokenization_xlnet import XLNetTokenizer + logger = 
logging.getLogger(__name__) diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index 7b3705cc1..fc1c918df 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -24,6 +24,7 @@ from io import open from .tokenization_utils import PreTrainedTokenizer + logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py index 48b9b04b4..80c499051 100644 --- a/transformers/tokenization_bert_japanese.py +++ b/transformers/tokenization_bert_japanese.py @@ -19,13 +19,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera import collections import logging import os -import six import unicodedata from io import open -from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab +import six + +from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab from .tokenization_utils import PreTrainedTokenizer + logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} diff --git a/transformers/tokenization_camembert.py b/transformers/tokenization_camembert.py index c1e80e0e0..c5ae705f5 100644 --- a/transformers/tokenization_camembert.py +++ b/transformers/tokenization_camembert.py @@ -20,9 +20,12 @@ import os from shutil import copyfile import sentencepiece as spm + from transformers.tokenization_utils import PreTrainedTokenizer + from .tokenization_xlnet import SPIECE_UNDERLINE + logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} diff --git a/transformers/tokenization_ctrl.py b/transformers/tokenization_ctrl.py index 2ce2bbf09..5b401f91f 100644 --- a/transformers/tokenization_ctrl.py +++ b/transformers/tokenization_ctrl.py @@ -18,11 +18,13 @@ from __future__ import absolute_import, division, print_function, unicode_litera import json import logging import os -import regex as re from io import open +import regex as re + from .tokenization_utils import PreTrainedTokenizer + logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { diff --git a/transformers/tokenization_distilbert.py b/transformers/tokenization_distilbert.py index 7fed1e405..bda5c6661 100644 --- a/transformers/tokenization_distilbert.py +++ b/transformers/tokenization_distilbert.py @@ -24,6 +24,7 @@ from io import open from .tokenization_bert import BertTokenizer + logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} diff --git a/transformers/tokenization_gpt2.py b/transformers/tokenization_gpt2.py index b6a0e7b78..06da88850 100644 --- a/transformers/tokenization_gpt2.py +++ b/transformers/tokenization_gpt2.py @@ -15,13 +15,17 @@ """Tokenization classes for OpenAI GPT.""" from __future__ import absolute_import, division, print_function, unicode_literals -import sys import json import logging import os -import regex as re +import sys from io import open +import regex as re + +from .tokenization_utils import PreTrainedTokenizer + + try: from functools import lru_cache except ImportError: @@ -31,8 +35,6 @@ except ImportError: return lambda func: func -from .tokenization_utils import PreTrainedTokenizer - logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { diff --git a/transformers/tokenization_openai.py b/transformers/tokenization_openai.py index d8f7549ed..4ea182c67 100644 --- a/transformers/tokenization_openai.py +++ b/transformers/tokenization_openai.py 
@@ -21,8 +21,9 @@ import os import re from io import open -from .tokenization_utils import PreTrainedTokenizer from .tokenization_bert import BasicTokenizer +from .tokenization_utils import PreTrainedTokenizer + logger = logging.getLogger(__name__) diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py index eae8b638f..95472f5b3 100644 --- a/transformers/tokenization_roberta.py +++ b/transformers/tokenization_roberta.py @@ -15,15 +15,17 @@ """Tokenization classes for RoBERTa.""" from __future__ import absolute_import, division, print_function, unicode_literals -import sys import json import logging import os -import regex as re +import sys from io import open +import regex as re + from .tokenization_gpt2 import GPT2Tokenizer + try: from functools import lru_cache except ImportError: diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py index 3b70d4085..8eb589cd1 100644 --- a/transformers/tokenization_t5.py +++ b/transformers/tokenization_t5.py @@ -19,11 +19,13 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging import os import re -import six from shutil import copyfile +import six + from .tokenization_utils import PreTrainedTokenizer + logger = logging.getLogger(__name__) SPIECE_UNDERLINE = "▁" diff --git a/transformers/tokenization_transfo_xl.py b/transformers/tokenization_transfo_xl.py index b2f59625f..ce058580b 100644 --- a/transformers/tokenization_transfo_xl.py +++ b/transformers/tokenization_transfo_xl.py @@ -30,6 +30,7 @@ import numpy as np from .file_utils import cached_path from .tokenization_utils import PreTrainedTokenizer + try: import torch except ImportError: diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index f848785ee..6cc1bedd8 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -15,16 +15,18 @@ """Tokenization classes for OpenAI GPT.""" from __future__ import absolute_import, division, print_function, unicode_literals -import logging -import os -import json -import six import copy import itertools +import json +import logging +import os import re from io import open -from .file_utils import cached_path, is_remote_url, hf_bucket_url, is_tf_available, is_torch_available +import six + +from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available + if is_tf_available(): import tensorflow as tf diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py index 9b96b92f2..7ef53cf80 100644 --- a/transformers/tokenization_xlm.py +++ b/transformers/tokenization_xlm.py @@ -25,8 +25,9 @@ from io import open import sacremoses as sm -from .tokenization_utils import PreTrainedTokenizer from .tokenization_bert import BasicTokenizer +from .tokenization_utils import PreTrainedTokenizer + logger = logging.getLogger(__name__) diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py index 30814c3a1..de71f87d0 100644 --- a/transformers/tokenization_xlm_roberta.py +++ b/transformers/tokenization_xlm_roberta.py @@ -20,9 +20,12 @@ import os from shutil import copyfile import sentencepiece as spm + from transformers.tokenization_utils import PreTrainedTokenizer + from .tokenization_xlnet import SPIECE_UNDERLINE + logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py index 
8ea0ccb77..6c016728e 100644 --- a/transformers/tokenization_xlnet.py +++ b/transformers/tokenization_xlnet.py @@ -17,13 +17,14 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging import os +import unicodedata from shutil import copyfile -import unicodedata import six from .tokenization_utils import PreTrainedTokenizer + logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} diff --git a/utils/download_glue_data.py b/utils/download_glue_data.py index 7262dd720..99eac9421 100644 --- a/utils/download_glue_data.py +++ b/utils/download_glue_data.py @@ -18,14 +18,15 @@ rm MSRParaphraseCorpus.msi 2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray! """ +import argparse import os -import sys import shutil -import argparse +import sys import tempfile import urllib.request import zipfile + TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"] TASK2PATH = { "CoLA": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4", -- GitLab From 9e80fc7b2fe4ae0dd1778c37149afd5b629adbde Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 15:58:40 +0100 Subject: [PATCH 07/32] Enforce isort in CI. We need https://github.com/timothycrosley/isort/pull/1000 but there's no release with this fix yet, so we'll install from GitHub. --- .circleci/config.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 500941548..89b78ae69 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -90,12 +90,15 @@ jobs: working_directory: ~/transformers docker: - image: circleci/python:3.6 - resource_class: small + resource_class: medium parallelism: 1 steps: - checkout - - run: sudo pip install black + - run: sudo pip install --editable . + - run: sudo pip install torch tensorflow tensorboardX scikit-learn + - run: sudo pip install black git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort - run: black --check --line-length 119 examples templates transformers utils + - run: isort --check-only --recursive examples templates transformers utils check_repository_consistency: working_directory: ~/transformers docker: -- GitLab From 5ca054757f28bc4c4bcb3c035681a0f4890e0a21 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 16:00:11 +0100 Subject: [PATCH 08/32] Update "make style" to sort imports with isort. --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index afc197c56..77aa43983 100644 --- a/Makefile +++ b/Makefile @@ -2,3 +2,4 @@ style: black --line-length 119 examples templates transformers utils + isort --recursive examples templates transformers utils -- GitLab From d0c9fe277a012d4ba8b3133ae1b39ff92c9a6dc7 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 16:29:21 +0100 Subject: [PATCH 09/32] Fix circular import in transformers.pipelines. Submodules shouldn't import from their parent in general. 
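As a minimal sketch of the pattern being replaced in pipelines.py: importing names back from the top-level `transformers` package re-enters `transformers/__init__.py` while that module may still be executing, which is what makes the import circular; importing the sibling submodules directly avoids the round trip.

    # before: inside transformers/pipelines.py, going back through the parent package (circular)
    from transformers import AutoConfig, AutoTokenizer

    # after: relative imports of sibling submodules, no pass through transformers/__init__.py
    from .configuration_auto import AutoConfig
    from .tokenization_auto import AutoTokenizer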
--- transformers/pipelines.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 257a891ae..3ddadc0cb 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -29,24 +29,19 @@ from typing import Dict, List, Optional, Tuple, Union import numpy as np import six -from transformers import ( - ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, - AutoConfig, - AutoTokenizer, - BasicTokenizer, - ModelCard, - PretrainedConfig, - PreTrainedTokenizer, - SquadExample, - is_tf_available, - is_torch_available, - squad_convert_examples_to_features, -) +from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig +from .configuration_utils import PretrainedConfig +from .data import SquadExample, squad_convert_examples_to_features +from .file_utils import is_tf_available, is_torch_available +from .modelcard import ModelCard +from .tokenization_auto import AutoTokenizer +from .tokenization_bert import BasicTokenizer +from .tokenization_utils import PreTrainedTokenizer if is_tf_available(): import tensorflow as tf - from transformers import ( + from .modeling_tf_auto import ( TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, @@ -55,7 +50,7 @@ if is_tf_available(): if is_torch_available(): import torch - from transformers import ( + from .modeling_auto import ( AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, -- GitLab From 1efa0a7552e2d8ae2f4523bbf589f999af11d3ab Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 17:06:41 +0100 Subject: [PATCH 10/32] Add black-compatible flake8 configuration. --- setup.cfg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup.cfg b/setup.cfg index 326d97a81..1b24e6d1e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,3 +7,7 @@ line_length = 119 lines_after_imports = 2 multi_line_output = 3 use_parentheses = True + +[flake8] +ignore = E203, E501, W503 +max-line-length = 119 -- GitLab From 28e608a2c2340ec33b8f5f49d36626eb12811866 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 17:02:36 +0100 Subject: [PATCH 11/32] Remove trailing whitespace from all Python files. Fixes flake8 warning W291 (x224). 
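The patch records only the result, not the tool used; as a rough sketch, the same cleanup could be scripted along these lines (assuming UTF-8 files with Unix newlines, and noting that it also strips whitespace-only lines, i.e. W293 as well as W291):

    import pathlib

    # Walk the tracked Python sources and drop trailing whitespace on every line,
    # preserving whether the file ends with a newline.
    for path in pathlib.Path(".").rglob("*.py"):
        text = path.read_text(encoding="utf-8")
        cleaned = "\n".join(line.rstrip() for line in text.splitlines())
        if text.endswith("\n"):
            cleaned += "\n"
        if cleaned != text:
            path.write_text(cleaned, encoding="utf-8")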
--- examples/benchmarks.py | 418 +++++++++--------- hubconf.py | 4 +- .../adding_a_new_model/modeling_tf_xxx.py | 6 +- templates/adding_a_new_model/modeling_xxx.py | 10 +- transformers/commands/user.py | 10 +- transformers/data/processors/squad.py | 6 +- transformers/modeling_albert.py | 10 +- transformers/modeling_auto.py | 4 +- transformers/modeling_camembert.py | 10 +- transformers/modeling_ctrl.py | 8 +- transformers/modeling_distilbert.py | 6 +- transformers/modeling_gpt2.py | 14 +- transformers/modeling_mmbt.py | 12 +- transformers/modeling_roberta.py | 12 +- transformers/modeling_t5.py | 2 +- transformers/modeling_tf_albert.py | 6 +- transformers/modeling_tf_ctrl.py | 2 +- transformers/modeling_tf_distilbert.py | 8 +- transformers/modeling_tf_gpt2.py | 4 +- transformers/modeling_tf_openai.py | 2 +- transformers/modeling_tf_roberta.py | 12 +- transformers/modeling_tf_t5.py | 2 +- transformers/modeling_tf_utils.py | 4 +- transformers/modeling_xlm.py | 4 +- transformers/modeling_xlm_roberta.py | 8 +- transformers/tokenization_utils.py | 4 +- utils/download_glue_data.py | 2 +- 27 files changed, 295 insertions(+), 295 deletions(-) diff --git a/examples/benchmarks.py b/examples/benchmarks.py index 4ef0640e3..07de19d4b 100644 --- a/examples/benchmarks.py +++ b/examples/benchmarks.py @@ -36,215 +36,215 @@ if is_torch_available(): from transformers import AutoModel -input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as -the Director of Hatcheries and Conditioning entered the room, in the - - - -scarcely breathing silence, the absent-minded, soliloquizing hum or -whistle, of absorbed concentration. A troop of newly arrived students, -very young, pink and callow, followed nervously, rather abjectly, at the -Director's heels. Each of them carried a notebook, in which, whenever -the great man spoke, he desperately scribbled. Straight from the -horse's mouth. It was a rare privilege. The D. H. C. for Central London -always made a point of personally conducting his new students round -the various departments. - -"Just to give you a general idea," he would explain to them. For of -course some sort of general idea they must have, if they were to do -their work intelligently-though as little of one, if they were to be good -and happy members of society, as possible. For particulars, as every -one knows, make for virtue and happiness; generalities are intellectu- -ally necessary evils. Not philosophers but fret-sawyers and stamp col- -lectors compose the backbone of society. - -"To-morrow," he would add, smiling at them with a slightly menacing -geniality, "you'll be settling down to serious work. You won't have time -for generalities. Meanwhile ..." - -Meanwhile, it was a privilege. Straight from the horse's mouth into the -notebook. The boys scribbled like mad. - -Tall and rather thin but upright, the Director advanced into the room. -He had a long chin and big rather prominent teeth, just covered, when -he was not talking, by his full, floridly curved lips. Old, young? Thirty? -Fifty? Fifty-five? It was hard to say. And anyhow the question didn't -arise; in this year of stability, A. F. 632, it didn't occur to you to ask it. - -"I shall begin at the beginning," said the D.H.C. and the more zealous -students recorded his intention in their notebooks: Begin at the begin- -ning. "These," he waved his hand, "are the incubators." And opening -an insulated door he showed them racks upon racks of numbered test- -tubes. "The week's supply of ova. 
Kept," he explained, "at blood heat; -whereas the male gametes," and here he opened another door, "they -have to be kept at thirty-five instead of thirty-seven. Full blood heat -sterilizes." Rams wrapped in theremogene beget no lambs. - -Still leaning against the incubators he gave them, while the pencils -scurried illegibly across the pages, a brief description of the modern - - - -fertilizing process; spoke first, of course, of its surgical introduc- -tion-"the operation undergone voluntarily for the good of Society, not -to mention the fact that it carries a bonus amounting to six months' -salary"; continued with some account of the technique for preserving -the excised ovary alive and actively developing; passed on to a consid- -eration of optimum temperature, salinity, viscosity; referred to the liq- -uor in which the detached and ripened eggs were kept; and, leading -his charges to the work tables, actually showed them how this liquor -was drawn off from the test-tubes; how it was let out drop by drop -onto the specially warmed slides of the microscopes; how the eggs -which it contained were inspected for abnormalities, counted and -transferred to a porous receptacle; how (and he now took them to -watch the operation) this receptacle was immersed in a warm bouillon -containing free-swimming spermatozoa-at a minimum concentration -of one hundred thousand per cubic centimetre, he insisted; and how, -after ten minutes, the container was lifted out of the liquor and its -contents re-examined; how, if any of the eggs remained unfertilized, it -was again immersed, and, if necessary, yet again; how the fertilized -ova went back to the incubators; where the Alphas and Betas re- -mained until definitely bottled; while the Gammas, Deltas and Epsilons -were brought out again, after only thirty-six hours, to undergo Bo- -kanovsky's Process. - -"Bokanovsky's Process," repeated the Director, and the students un- -derlined the words in their little notebooks. - -One egg, one embryo, one adult-normality. But a bokanovskified egg -will bud, will proliferate, will divide. From eight to ninety-six buds, and -every bud will grow into a perfectly formed embryo, and every embryo -into a full-sized adult. Making ninety-six human beings grow where -only one grew before. Progress. - -"Essentially," the D.H.C. concluded, "bokanovskification consists of a -series of arrests of development. We check the normal growth and, -paradoxically enough, the egg responds by budding." - -Responds by budding. The pencils were busy. - -He pointed. On a very slowly moving band a rack-full of test-tubes was -entering a large metal box, another, rack-full was emerging. Machinery -faintly purred. It took eight minutes for the tubes to go through, he - - - -told them. Eight minutes of hard X-rays being about as much as an -egg can stand. A few died; of the rest, the least susceptible divided -into two; most put out four buds; some eight; all were returned to the -incubators, where the buds began to develop; then, after two days, -were suddenly chilled, chilled and checked. Two, four, eight, the buds -in their turn budded; and having budded were dosed almost to death -with alcohol; consequently burgeoned again and having budded-bud -out of bud out of bud-were thereafter-further arrest being generally -fatal-left to develop in peace. By which time the original egg was in a -fair way to becoming anything from eight to ninety-six embryos- a -prodigious improvement, you will agree, on nature. 
Identical twins-but -not in piddling twos and threes as in the old viviparous days, when an -egg would sometimes accidentally divide; actually by dozens, by -scores at a time. - -"Scores," the Director repeated and flung out his arms, as though he -were distributing largesse. "Scores." - -But one of the students was fool enough to ask where the advantage -lay. - -"My good boy!" The Director wheeled sharply round on him. "Can't you -see? Can't you see?" He raised a hand; his expression was solemn. -"Bokanovsky's Process is one of the major instruments of social stabil- -ity!" - -Major instruments of social stability. - -Standard men and women; in uniform batches. The whole of a small -factory staffed with the products of a single bokanovskified egg. - -"Ninety-six identical twins working ninety-six identical machines!" The -voice was almost tremulous with enthusiasm. "You really know where -you are. For the first time in history." He quoted the planetary motto. -"Community, Identity, Stability." Grand words. "If we could bo- -kanovskify indefinitely the whole problem would be solved." - -Solved by standard Gammas, unvarying Deltas, uniform Epsilons. Mil- -lions of identical twins. The principle of mass production at last applied -to biology. - - - -"But, alas," the Director shook his head, "we can't bokanovskify indefi- -nitely." - -Ninety-six seemed to be the limit; seventy-two a good average. From -the same ovary and with gametes of the same male to manufacture as -many batches of identical twins as possible-that was the best (sadly a -second best) that they could do. And even that was difficult. - -"For in nature it takes thirty years for two hundred eggs to reach ma- -turity. But our business is to stabilize the population at this moment, -here and now. Dribbling out twins over a quarter of a century-what -would be the use of that?" - -Obviously, no use at all. But Podsnap's Technique had immensely ac- -celerated the process of ripening. They could make sure of at least a -hundred and fifty mature eggs within two years. Fertilize and bo- -kanovskify-in other words, multiply by seventy-two-and you get an -average of nearly eleven thousand brothers and sisters in a hundred -and fifty batches of identical twins, all within two years of the same -age. - -"And in exceptional cases we can make one ovary yield us over fifteen -thousand adult individuals." - -Beckoning to a fair-haired, ruddy young man who happened to be -passing at the moment. "Mr. Foster," he called. The ruddy young man -approached. "Can you tell us the record for a single ovary, Mr. Foster?" - -"Sixteen thousand and twelve in this Centre," Mr. Foster replied with- -out hesitation. He spoke very quickly, had a vivacious blue eye, and -took an evident pleasure in quoting figures. "Sixteen thousand and -twelve; in one hundred and eighty-nine batches of identicals. But of -course they've done much better," he rattled on, "in some of the tropi- -cal Centres. Singapore has often produced over sixteen thousand five -hundred; and Mombasa has actually touched the seventeen thousand -mark. But then they have unfair advantages. You should see the way a -negro ovary responds to pituitary! It's quite astonishing, when you're -used to working with European material. Still," he added, with a laugh -(but the light of combat was in his eyes and the lift of his chin was -challenging), "still, we mean to beat them if we can. I'm working on a -wonderful Delta-Minus ovary at this moment. Only just eighteen - - - -months old. 
Over twelve thousand seven hundred children already, ei- -ther decanted or in embryo. And still going strong. We'll beat them -yet." - -"That's the spirit I like!" cried the Director, and clapped Mr. Foster on -the shoulder. "Come along with us, and give these boys the benefit of -your expert knowledge." - -Mr. Foster smiled modestly. "With pleasure." They went. -In the Bottling Room all was harmonious bustle and ordered activity. -Flaps of fresh sow's peritoneum ready cut to the proper size came -shooting up in little lifts from the Organ Store in the sub-basement. -Whizz and then, click! the lift-hatches hew open; the bottle-liner had -only to reach out a hand, take the flap, insert, smooth-down, and be- -fore the lined bottle had had time to travel out of reach along the end- -less band, whizz, click! another flap of peritoneum had shot up from -the depths, ready to be slipped into yet another bottle, the next of that -slow interminable procession on the band. - -Next to the Liners stood the Matriculators. The procession advanced; -one by one the eggs were transferred from their test-tubes to the -larger containers; deftly the peritoneal lining was slit, the morula -dropped into place, the saline solution poured in ... and already the -bottle had passed, and it was the turn of the labellers. Heredity, date -of fertilization, membership of Bokanovsky Group-details were trans- -ferred from test-tube to bottle. No longer anonymous, but named, -identified, the procession marched slowly on; on through an opening in -the wall, slowly on into the Social Predestination Room. -"Eighty-eight cubic metres of card-index," said Mr. Foster with relish, +input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as +the Director of Hatcheries and Conditioning entered the room, in the + + + +scarcely breathing silence, the absent-minded, soliloquizing hum or +whistle, of absorbed concentration. A troop of newly arrived students, +very young, pink and callow, followed nervously, rather abjectly, at the +Director's heels. Each of them carried a notebook, in which, whenever +the great man spoke, he desperately scribbled. Straight from the +horse's mouth. It was a rare privilege. The D. H. C. for Central London +always made a point of personally conducting his new students round +the various departments. + +"Just to give you a general idea," he would explain to them. For of +course some sort of general idea they must have, if they were to do +their work intelligently-though as little of one, if they were to be good +and happy members of society, as possible. For particulars, as every +one knows, make for virtue and happiness; generalities are intellectu- +ally necessary evils. Not philosophers but fret-sawyers and stamp col- +lectors compose the backbone of society. + +"To-morrow," he would add, smiling at them with a slightly menacing +geniality, "you'll be settling down to serious work. You won't have time +for generalities. Meanwhile ..." + +Meanwhile, it was a privilege. Straight from the horse's mouth into the +notebook. The boys scribbled like mad. + +Tall and rather thin but upright, the Director advanced into the room. +He had a long chin and big rather prominent teeth, just covered, when +he was not talking, by his full, floridly curved lips. Old, young? Thirty? +Fifty? Fifty-five? It was hard to say. And anyhow the question didn't +arise; in this year of stability, A. F. 632, it didn't occur to you to ask it. + +"I shall begin at the beginning," said the D.H.C. 
and the more zealous +students recorded his intention in their notebooks: Begin at the begin- +ning. "These," he waved his hand, "are the incubators." And opening +an insulated door he showed them racks upon racks of numbered test- +tubes. "The week's supply of ova. Kept," he explained, "at blood heat; +whereas the male gametes," and here he opened another door, "they +have to be kept at thirty-five instead of thirty-seven. Full blood heat +sterilizes." Rams wrapped in theremogene beget no lambs. + +Still leaning against the incubators he gave them, while the pencils +scurried illegibly across the pages, a brief description of the modern + + + +fertilizing process; spoke first, of course, of its surgical introduc- +tion-"the operation undergone voluntarily for the good of Society, not +to mention the fact that it carries a bonus amounting to six months' +salary"; continued with some account of the technique for preserving +the excised ovary alive and actively developing; passed on to a consid- +eration of optimum temperature, salinity, viscosity; referred to the liq- +uor in which the detached and ripened eggs were kept; and, leading +his charges to the work tables, actually showed them how this liquor +was drawn off from the test-tubes; how it was let out drop by drop +onto the specially warmed slides of the microscopes; how the eggs +which it contained were inspected for abnormalities, counted and +transferred to a porous receptacle; how (and he now took them to +watch the operation) this receptacle was immersed in a warm bouillon +containing free-swimming spermatozoa-at a minimum concentration +of one hundred thousand per cubic centimetre, he insisted; and how, +after ten minutes, the container was lifted out of the liquor and its +contents re-examined; how, if any of the eggs remained unfertilized, it +was again immersed, and, if necessary, yet again; how the fertilized +ova went back to the incubators; where the Alphas and Betas re- +mained until definitely bottled; while the Gammas, Deltas and Epsilons +were brought out again, after only thirty-six hours, to undergo Bo- +kanovsky's Process. + +"Bokanovsky's Process," repeated the Director, and the students un- +derlined the words in their little notebooks. + +One egg, one embryo, one adult-normality. But a bokanovskified egg +will bud, will proliferate, will divide. From eight to ninety-six buds, and +every bud will grow into a perfectly formed embryo, and every embryo +into a full-sized adult. Making ninety-six human beings grow where +only one grew before. Progress. + +"Essentially," the D.H.C. concluded, "bokanovskification consists of a +series of arrests of development. We check the normal growth and, +paradoxically enough, the egg responds by budding." + +Responds by budding. The pencils were busy. + +He pointed. On a very slowly moving band a rack-full of test-tubes was +entering a large metal box, another, rack-full was emerging. Machinery +faintly purred. It took eight minutes for the tubes to go through, he + + + +told them. Eight minutes of hard X-rays being about as much as an +egg can stand. A few died; of the rest, the least susceptible divided +into two; most put out four buds; some eight; all were returned to the +incubators, where the buds began to develop; then, after two days, +were suddenly chilled, chilled and checked. 
Two, four, eight, the buds +in their turn budded; and having budded were dosed almost to death +with alcohol; consequently burgeoned again and having budded-bud +out of bud out of bud-were thereafter-further arrest being generally +fatal-left to develop in peace. By which time the original egg was in a +fair way to becoming anything from eight to ninety-six embryos- a +prodigious improvement, you will agree, on nature. Identical twins-but +not in piddling twos and threes as in the old viviparous days, when an +egg would sometimes accidentally divide; actually by dozens, by +scores at a time. + +"Scores," the Director repeated and flung out his arms, as though he +were distributing largesse. "Scores." + +But one of the students was fool enough to ask where the advantage +lay. + +"My good boy!" The Director wheeled sharply round on him. "Can't you +see? Can't you see?" He raised a hand; his expression was solemn. +"Bokanovsky's Process is one of the major instruments of social stabil- +ity!" + +Major instruments of social stability. + +Standard men and women; in uniform batches. The whole of a small +factory staffed with the products of a single bokanovskified egg. + +"Ninety-six identical twins working ninety-six identical machines!" The +voice was almost tremulous with enthusiasm. "You really know where +you are. For the first time in history." He quoted the planetary motto. +"Community, Identity, Stability." Grand words. "If we could bo- +kanovskify indefinitely the whole problem would be solved." + +Solved by standard Gammas, unvarying Deltas, uniform Epsilons. Mil- +lions of identical twins. The principle of mass production at last applied +to biology. + + + +"But, alas," the Director shook his head, "we can't bokanovskify indefi- +nitely." + +Ninety-six seemed to be the limit; seventy-two a good average. From +the same ovary and with gametes of the same male to manufacture as +many batches of identical twins as possible-that was the best (sadly a +second best) that they could do. And even that was difficult. + +"For in nature it takes thirty years for two hundred eggs to reach ma- +turity. But our business is to stabilize the population at this moment, +here and now. Dribbling out twins over a quarter of a century-what +would be the use of that?" + +Obviously, no use at all. But Podsnap's Technique had immensely ac- +celerated the process of ripening. They could make sure of at least a +hundred and fifty mature eggs within two years. Fertilize and bo- +kanovskify-in other words, multiply by seventy-two-and you get an +average of nearly eleven thousand brothers and sisters in a hundred +and fifty batches of identical twins, all within two years of the same +age. + +"And in exceptional cases we can make one ovary yield us over fifteen +thousand adult individuals." + +Beckoning to a fair-haired, ruddy young man who happened to be +passing at the moment. "Mr. Foster," he called. The ruddy young man +approached. "Can you tell us the record for a single ovary, Mr. Foster?" + +"Sixteen thousand and twelve in this Centre," Mr. Foster replied with- +out hesitation. He spoke very quickly, had a vivacious blue eye, and +took an evident pleasure in quoting figures. "Sixteen thousand and +twelve; in one hundred and eighty-nine batches of identicals. But of +course they've done much better," he rattled on, "in some of the tropi- +cal Centres. Singapore has often produced over sixteen thousand five +hundred; and Mombasa has actually touched the seventeen thousand +mark. 
But then they have unfair advantages. You should see the way a +negro ovary responds to pituitary! It's quite astonishing, when you're +used to working with European material. Still," he added, with a laugh +(but the light of combat was in his eyes and the lift of his chin was +challenging), "still, we mean to beat them if we can. I'm working on a +wonderful Delta-Minus ovary at this moment. Only just eighteen + + + +months old. Over twelve thousand seven hundred children already, ei- +ther decanted or in embryo. And still going strong. We'll beat them +yet." + +"That's the spirit I like!" cried the Director, and clapped Mr. Foster on +the shoulder. "Come along with us, and give these boys the benefit of +your expert knowledge." + +Mr. Foster smiled modestly. "With pleasure." They went. +In the Bottling Room all was harmonious bustle and ordered activity. +Flaps of fresh sow's peritoneum ready cut to the proper size came +shooting up in little lifts from the Organ Store in the sub-basement. +Whizz and then, click! the lift-hatches hew open; the bottle-liner had +only to reach out a hand, take the flap, insert, smooth-down, and be- +fore the lined bottle had had time to travel out of reach along the end- +less band, whizz, click! another flap of peritoneum had shot up from +the depths, ready to be slipped into yet another bottle, the next of that +slow interminable procession on the band. + +Next to the Liners stood the Matriculators. The procession advanced; +one by one the eggs were transferred from their test-tubes to the +larger containers; deftly the peritoneal lining was slit, the morula +dropped into place, the saline solution poured in ... and already the +bottle had passed, and it was the turn of the labellers. Heredity, date +of fertilization, membership of Bokanovsky Group-details were trans- +ferred from test-tube to bottle. No longer anonymous, but named, +identified, the procession marched slowly on; on through an opening in +the wall, slowly on into the Social Predestination Room. +"Eighty-eight cubic metres of card-index," said Mr. Foster with relish, as they entered.""" diff --git a/hubconf.py b/hubconf.py index f8d0d1a84..4e5c1b4b0 100644 --- a/hubconf.py +++ b/hubconf.py @@ -14,7 +14,7 @@ dependencies = ["torch", "tqdm", "boto3", "requests", "regex", "sentencepiece", @add_start_docstrings(AutoConfig.__doc__) def config(*args, **kwargs): - r""" + r""" # Using torch.hub ! import torch @@ -34,7 +34,7 @@ def config(*args, **kwargs): @add_start_docstrings(AutoTokenizer.__doc__) def tokenizer(*args, **kwargs): - r""" + r""" # Using torch.hub ! import torch diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py index 3e8f51bfd..d5f6c1e74 100644 --- a/templates/adding_a_new_model/modeling_tf_xxx.py +++ b/templates/adding_a_new_model/modeling_tf_xxx.py @@ -216,7 +216,7 @@ XXX_START_DOCSTRING = r""" The XXX model was proposed in `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: - config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 
""" @@ -230,13 +230,13 @@ XXX_INPUTS_DOCSTRING = r""" (a) For sequence pairs: ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` - + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` (b) For single sequences: ``tokens: [CLS] the dog is hairy . [SEP]`` - + ``token_type_ids: 0 0 0 0 0 0 0`` Xxx is a model with absolute position embeddings so it's usually advised to pad the inputs on diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py index 4ea3cca8c..e4e72c97f 100644 --- a/templates/adding_a_new_model/modeling_xxx.py +++ b/templates/adding_a_new_model/modeling_xxx.py @@ -198,7 +198,7 @@ XXX_START_DOCSTRING = r""" The XXX model was proposed in https://pytorch.org/docs/stable/nn.html#module Parameters: - config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ @@ -212,13 +212,13 @@ XXX_INPUTS_DOCSTRING = r""" (a) For sequence pairs: ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` - + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` (b) For single sequences: ``tokens: [CLS] the dog is hairy . [SEP]`` - + ``token_type_ids: 0 0 0 0 0 0 0`` Xxx is a model with absolute position embeddings so it's usually advised to pad the inputs on @@ -670,9 +670,9 @@ class XxxForQuestionAnswering(XxxPreTrainedModel): question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]" input_ids = tokenizer.encode(input_text) - token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] + token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) - all_tokens = tokenizer.convert_ids_to_tokens(input_ids) + all_tokens = tokenizer.convert_ids_to_tokens(input_ids) print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])) # a nice puppet diff --git a/transformers/commands/user.py b/transformers/commands/user.py index c6edda280..6800920cf 100644 --- a/transformers/commands/user.py +++ b/transformers/commands/user.py @@ -49,11 +49,11 @@ class LoginCommand(BaseUserCommand): def run(self): print( """ - _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_| - _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| - _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_| - _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| - _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_| + _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_| + _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| + _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_| + _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| + _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_| """ ) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index d47211a0d..c6581374c 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -281,7 +281,7 @@ def 
squad_convert_examples_to_features( processor = SquadV2Processor() examples = processor.get_dev_examples(data_dir) - features = squad_convert_examples_to_features( + features = squad_convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, @@ -640,8 +640,8 @@ class SquadFeatures(object): has more information related to that token and should be prioritized over this feature for that token. tokens: list of tokens corresponding to the input ids token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer. - start_position: start of the answer token index - end_position: end of the answer token index + start_position: start of the answer token index + end_position: end of the answer token index """ def __init__( diff --git a/transformers/modeling_albert.py b/transformers/modeling_albert.py index 7ff200084..669d8fc35 100644 --- a/transformers/modeling_albert.py +++ b/transformers/modeling_albert.py @@ -396,7 +396,7 @@ ALBERT_START_DOCSTRING = r""" The ALBERT model was proposed in https://pytorch.org/docs/stable/nn.html#module Parameters: - config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ @@ -410,13 +410,13 @@ ALBERT_INPUTS_DOCSTRING = r""" (a) For sequence pairs: ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` - + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` (b) For single sequences: ``tokens: [CLS] the dog is hairy . [SEP]`` - + ``token_type_ids: 0 0 0 0 0 0 0`` Albert is a model with absolute position embeddings so it's usually advised to pad the inputs on @@ -796,9 +796,9 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel): question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]" input_ids = tokenizer.encode(input_text) - token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] + token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) - all_tokens = tokenizer.convert_ids_to_tokens(input_ids) + all_tokens = tokenizer.convert_ids_to_tokens(input_ids) print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])) # a nice puppet diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index bcdde45bd..2df74130e 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -864,7 +864,7 @@ class AutoModelForTokenClassification: def from_config(cls, config): r""" Instantiates one of the base model classes of the library from a configuration. 
- + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) @@ -874,7 +874,7 @@ class AutoModelForTokenClassification: - isInstance of `roberta` configuration class: RobertaModel (Roberta model) Examples:: - + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelForTokenClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ diff --git a/transformers/modeling_camembert.py b/transformers/modeling_camembert.py index 363399ee5..d38751a1d 100644 --- a/transformers/modeling_camembert.py +++ b/transformers/modeling_camembert.py @@ -40,9 +40,9 @@ CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { CAMEMBERT_START_DOCSTRING = r""" The CamemBERT model was proposed in `CamemBERT: a Tasty French Language Model`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah, and Benoît Sagot. It is based on Facebook's RoBERTa model released in 2019. - + It is a model trained on 138GB of French text. - + This implementation is the same as RoBERTa. This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and @@ -55,7 +55,7 @@ CAMEMBERT_START_DOCSTRING = r""" The CamemBERT model was proposed in https://pytorch.org/docs/stable/nn.html#module Parameters: - config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the + config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ @@ -74,7 +74,7 @@ CAMEMBERT_INPUTS_DOCSTRING = r""" ``tokens: the dog is hairy . `` - Fully encoded sequences or sequence pairs can be obtained using the CamembertTokenizer.encode function with + Fully encoded sequences or sequence pairs can be obtained using the CamembertTokenizer.encode function with the ``add_special_tokens`` parameter set to ``True``. CamemBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on @@ -199,7 +199,7 @@ class CamembertForMaskedLM(RobertaForMaskedLM): @add_start_docstrings( - """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer + """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING, diff --git a/transformers/modeling_ctrl.py b/transformers/modeling_ctrl.py index f34189170..193a3d7d1 100644 --- a/transformers/modeling_ctrl.py +++ b/transformers/modeling_ctrl.py @@ -192,7 +192,7 @@ class CTRLPreTrainedModel(PreTrainedModel): module.weight.data.fill_(1.0) -CTRL_START_DOCSTRING = r""" CTRL model was proposed in +CTRL_START_DOCSTRING = r""" CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 
It's a causal (unidirectional) transformer pre-trained using language modeling on a very large @@ -224,7 +224,7 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs: **past**: list of ``torch.FloatTensor`` (one for each layer): that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model - (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model + (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. @@ -261,7 +261,7 @@ class CTRLModel(CTRLPreTrainedModel): **past**: list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``: that contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model + Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) @@ -464,7 +464,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): **past**: list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``: that contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model + Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py index 5fef44384..aa732b31e 100644 --- a/transformers/modeling_distilbert.py +++ b/transformers/modeling_distilbert.py @@ -366,12 +366,12 @@ DISTILBERT_START_DOCSTRING = r""" For more information on DistilBERT, please refer to our `detailed blog post`_ - + .. _`detailed blog post`: https://medium.com/huggingface/distilbert-8cf3380435b5 Parameters: - config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ @@ -381,7 +381,7 @@ DISTILBERT_INPUTS_DOCSTRING = r""" **input_ids** ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Indices of input sequence tokens in the vocabulary. The input sequences should start with `[CLS]` and end with `[SEP]` tokens. 
- + For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DistilBERT. **attention_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py index 94e977e62..1e5c54c95 100644 --- a/transformers/modeling_gpt2.py +++ b/transformers/modeling_gpt2.py @@ -304,7 +304,7 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs: **past**: list of ``torch.FloatTensor`` (one for each layer): that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model - (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model + (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. @@ -341,7 +341,7 @@ class GPT2Model(GPT2PreTrainedModel): **past**: list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``: that contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model + Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) @@ -532,7 +532,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): **past**: list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``: that contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model + Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) @@ -640,7 +640,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): **past**: list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``: that contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model + Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. 
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) @@ -654,15 +654,15 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): import torch from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel - + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2DoubleHeadsModel.from_pretrained('gpt2') - + # Add a [CLS] to the vocabulary (we should train it also!) tokenizer.add_special_tokens({'cls_token': '[CLS]'}) model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary - + choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] encoded_choices = [tokenizer.encode(s) for s in choices] cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] diff --git a/transformers/modeling_mmbt.py b/transformers/modeling_mmbt.py index 2c22a409b..490969fc3 100644 --- a/transformers/modeling_mmbt.py +++ b/transformers/modeling_mmbt.py @@ -75,10 +75,10 @@ class ModalEmbeddings(nn.Module): return embeddings -MMBT_START_DOCSTRING = r""" MMBT model was proposed in +MMBT_START_DOCSTRING = r""" MMBT model was proposed in `Supervised Multimodal Bitransformers for Classifying Images and Text`_ by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine. - It's a supervised multimodal bitransformer model that fuses information from text and other image encoders, + It's a supervised multimodal bitransformer model that fuses information from text and other image encoders, and obtain state-of-the-art performance on various multimodal classification benchmark tasks. This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and @@ -93,15 +93,15 @@ MMBT_START_DOCSTRING = r""" MMBT model was proposed in Parameters: config (:class:`~transformers.MMBTConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. - transformer (:class: `~nn.Module`): A text transformer that is used by MMBT. + transformer (:class: `~nn.Module`): A text transformer that is used by MMBT. It should have embeddings, encoder, and pooler attributes. - encoder (:class: `~nn.Module`): Encoder for the second modality. + encoder (:class: `~nn.Module`): Encoder for the second modality. It should take in a batch of modal inputs and return k, n dimension embeddings. """ MMBT_INPUTS_DOCSTRING = r""" Inputs: **input_modal**: ``torch.FloatTensor`` of shape ``(batch_size, ***)``: - The other modality data. It will be the shape that the encoder for that type expects. + The other modality data. It will be the shape that the encoder for that type expects. e.g. With an Image Encoder, the shape would be (batch_size, channels, height, width) **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Indices of input sequence tokens in the vocabulary. @@ -119,7 +119,7 @@ MMBT_INPUTS_DOCSTRING = r""" Inputs: **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Segment token indices to indicate different portions of the inputs. **modal_token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, modal_sequence_length)``: - Segment token indices to indicate different portions of the non-text modality. 
+ Segment token indices to indicate different portions of the non-text modality. The embeddings from these tokens will be summed with the respective token embeddings for the non-text modality. **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Indices of positions of each input sequence tokens in the position embeddings. diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index f6233061a..d4cad0d0d 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -97,11 +97,11 @@ ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018. - + It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with much larger mini-batches and learning rates. - - This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained + + This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained models. This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and @@ -114,7 +114,7 @@ ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in https://pytorch.org/docs/stable/nn.html#module Parameters: - config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the + config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ @@ -133,7 +133,7 @@ ROBERTA_INPUTS_DOCSTRING = r""" ``tokens: the dog is hairy . `` - Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with + Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with the ``add_special_tokens`` parameter set to ``True``. RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on @@ -319,7 +319,7 @@ class RobertaLMHead(nn.Module): @add_start_docstrings( - """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer + """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING, diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index 1467a0cd2..b67f01228 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -661,7 +661,7 @@ T5_START_DOCSTRING = r""" The T5 model was proposed in https://pytorch.org/docs/stable/nn.html#module Parameters: - config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 
""" diff --git a/transformers/modeling_tf_albert.py b/transformers/modeling_tf_albert.py index de6ef405c..e921dc8ca 100644 --- a/transformers/modeling_tf_albert.py +++ b/transformers/modeling_tf_albert.py @@ -510,7 +510,7 @@ ALBERT_START_DOCSTRING = r""" The ALBERT model was proposed in `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: - config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ @@ -524,13 +524,13 @@ ALBERT_INPUTS_DOCSTRING = r""" (a) For sequence pairs: ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` - + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` (b) For single sequences: ``tokens: [CLS] the dog is hairy . [SEP]`` - + ``token_type_ids: 0 0 0 0 0 0 0`` Albert is a model with absolute position embeddings so it's usually advised to pad the inputs on diff --git a/transformers/modeling_tf_ctrl.py b/transformers/modeling_tf_ctrl.py index a3a22040d..6f878c549 100644 --- a/transformers/modeling_tf_ctrl.py +++ b/transformers/modeling_tf_ctrl.py @@ -356,7 +356,7 @@ class TFCTRLPreTrainedModel(TFPreTrainedModel): base_model_prefix = "transformer" -CTRL_START_DOCSTRING = r""" CTRL model was proposed in +CTRL_START_DOCSTRING = r""" CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. It's a causal (unidirectional) transformer pre-trained using language modeling on a very large diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py index 98317488b..297d7edb1 100644 --- a/transformers/modeling_tf_distilbert.py +++ b/transformers/modeling_tf_distilbert.py @@ -109,7 +109,7 @@ class TFEmbeddings(tf.keras.layers.Layer): linear tensor, float32 with shape [batch_size, length, vocab_size]. Raises: ValueError: if mode is not valid. - + Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ @@ -487,7 +487,7 @@ DISTILBERT_START_DOCSTRING = r""" For more information on DistilBERT, please refer to our `detailed blog post`_ - + This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. @@ -514,7 +514,7 @@ DISTILBERT_START_DOCSTRING = r""" `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: - config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ @@ -524,7 +524,7 @@ DISTILBERT_INPUTS_DOCSTRING = r""" **input_ids** ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: Indices of input sequence tokens in the vocabulary. 
The input sequences should start with `[CLS]` and end with `[SEP]` tokens. - + For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DistilBERT. **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py index bf551991b..9ad049e9c 100644 --- a/transformers/modeling_tf_gpt2.py +++ b/transformers/modeling_tf_gpt2.py @@ -584,14 +584,14 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2') - + # Add a [CLS] to the vocabulary (we should train it also!) # This option is currently not implemented in TF 2.0 raise NotImplementedError tokenizer.add_special_tokens({'cls_token': '[CLS]'}) model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary - + choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] encoded_choices = [tokenizer.encode(s) for s in choices] cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] diff --git a/transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py index 44924b4f4..1fd2e961f 100644 --- a/transformers/modeling_tf_openai.py +++ b/transformers/modeling_tf_openai.py @@ -553,7 +553,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') - + # Add a [CLS] to the vocabulary (we should train it also!) # This option is currently not implemented in TF 2.0 raise NotImplementedError diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py index 5c40682c3..7aca11f5b 100644 --- a/transformers/modeling_tf_roberta.py +++ b/transformers/modeling_tf_roberta.py @@ -111,11 +111,11 @@ ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018. - + It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with much larger mini-batches and learning rates. - - This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained + + This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained models. This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and @@ -144,7 +144,7 @@ ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: - config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the + config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. 
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ @@ -163,7 +163,7 @@ ROBERTA_INPUTS_DOCSTRING = r""" ``tokens: the dog is hairy . `` - Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with + Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with the ``add_special_tokens`` parameter set to ``True``. RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on @@ -351,7 +351,7 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer): @add_start_docstrings( - """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer + """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING, diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py index e995bc3c9..5ab16ea43 100644 --- a/transformers/modeling_tf_t5.py +++ b/transformers/modeling_tf_t5.py @@ -565,7 +565,7 @@ T5_START_DOCSTRING = r""" The T5 model was proposed in `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: - config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py index 637013b37..69ceaaf5a 100644 --- a/transformers/modeling_tf_utils.py +++ b/transformers/modeling_tf_utils.py @@ -139,7 +139,7 @@ class TFPreTrainedModel(tf.keras.Model): Arguments: new_num_tokens: (`optional`) int: - New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. + New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. If not provided or None: does nothing and just returns a pointer to the input tokens ``tf.Variable`` Module of the model. Return: ``tf.Variable`` @@ -431,7 +431,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer): linear tensor, float32 with shape [batch_size, length, vocab_size]. Raises: ValueError: if mode is not valid. - + Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py index 2127bbad3..ac45e3e20 100644 --- a/transformers/modeling_xlm.py +++ b/transformers/modeling_xlm.py @@ -825,7 +825,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Labels for position (index) of the classification token to use as input for computing plausibility of the answer. **p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...) + Optional mask of tokens which can't be in answers (e.g. 
[CLS], [PAD], ...) Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: @@ -942,7 +942,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Labels for position (index) of the classification token to use as input for computing plausibility of the answer. **p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...) + Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...) Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py index adf7f2334..f20e8e300 100644 --- a/transformers/modeling_xlm_roberta.py +++ b/transformers/modeling_xlm_roberta.py @@ -45,7 +45,7 @@ XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { XLM_ROBERTA_START_DOCSTRING = r""" The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning at Scale`_ by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019. - + It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data. This implementation is the same as RoBERTa. @@ -60,7 +60,7 @@ XLM_ROBERTA_START_DOCSTRING = r""" The XLM-RoBERTa model was proposed in https://pytorch.org/docs/stable/nn.html#module Parameters: - config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the + config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ @@ -79,7 +79,7 @@ XLM_ROBERTA_INPUTS_DOCSTRING = r""" ``tokens: the dog is hairy . `` - Fully encoded sequences or sequence pairs can be obtained using the XLMRobertaTokenizer.encode function with + Fully encoded sequences or sequence pairs can be obtained using the XLMRobertaTokenizer.encode function with the ``add_special_tokens`` parameter set to ``True``. XLM-RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on @@ -204,7 +204,7 @@ class XLMRobertaForMaskedLM(RobertaForMaskedLM): @add_start_docstrings( - """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer + """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING, diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 6cc1bedd8..f0df6de60 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -868,7 +868,7 @@ class PreTrainedTokenizer(object): padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. 
The tokenizer padding sides are handled by the following strings: - 'left': pads on the left of the sequences - - 'right': pads on the right of the sequences + - 'right': pads on the right of the sequences Defaults to False: no padding. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. @@ -1073,7 +1073,7 @@ class PreTrainedTokenizer(object): padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. The tokenizer padding sides are handled by the following strings: - 'left': pads on the left of the sequences - - 'right': pads on the right of the sequences + - 'right': pads on the right of the sequences Defaults to False: no padding. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. diff --git a/utils/download_glue_data.py b/utils/download_glue_data.py index 99eac9421..af1562937 100644 --- a/utils/download_glue_data.py +++ b/utils/download_glue_data.py @@ -2,7 +2,7 @@ Original source: https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e Note: for legal reasons, we are unable to host MRPC. -You can either use the version hosted by the SentEval team, which is already tokenized, +You can either use the version hosted by the SentEval team, which is already tokenized, or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually. For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example). You should then rename and place specific files in a folder (see below for an example). -- GitLab From f9c5317db29f66e0a10c7d070ef3a9dde10172de Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 17:57:20 +0100 Subject: [PATCH 12/32] Fix E265 flake8 warning (x1). --- examples/contrib/run_swag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py index 65c07c2a3..a996a2ac9 100644 --- a/examples/contrib/run_swag.py +++ b/examples/contrib/run_swag.py @@ -356,7 +356,7 @@ def train(args, train_dataset, model, tokenizer): inputs = { "input_ids": batch[0], "attention_mask": batch[1], - #'token_type_ids': None if args.model_type == 'xlm' else batch[2], + # 'token_type_ids': None if args.model_type == 'xlm' else batch[2], "token_type_ids": batch[2], "labels": batch[3], } -- GitLab From 357db7098c4431a7b28526b8de74c0fa08af0a9b Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 17:58:17 +0100 Subject: [PATCH 13/32] Fix E712 flake8 warning (x1). 
--- examples/run_multiple_choice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index 19ca558ca..27b2e51e6 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -334,7 +334,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): cached_mode = "test" else: cached_mode = "train" - assert (evaluate == True and test == True) == False + assert not (evaluate and test) cached_features_file = os.path.join( args.data_dir, "cached_{}_{}_{}_{}".format( -- GitLab From b1de7ae08a9674dc2fd4d69787c6a2a198971600 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 17:59:13 +0100 Subject: [PATCH 14/32] Fix F811 flake8 warning (x1). --- transformers/tests/modeling_xlnet_test.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/transformers/tests/modeling_xlnet_test.py b/transformers/tests/modeling_xlnet_test.py index ac0e542cc..a0d950472 100644 --- a/transformers/tests/modeling_xlnet_test.py +++ b/transformers/tests/modeling_xlnet_test.py @@ -402,24 +402,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, ) - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids_1, - input_ids_2, - input_ids_q, - perm_mask, - input_mask, - target_mapping, - segment_ids, - lm_labels, - sequence_labels, - is_impossible_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids_1} - return config, inputs_dict - def create_and_check_xlnet_sequence_classif( self, config, -- GitLab From eed46f38b77b83cdd3df4c20a1a12e1d20a31dac Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 18:01:03 +0100 Subject: [PATCH 15/32] Fix E302 flake8 warning (x3). 
--- templates/adding_a_new_model/modeling_tf_xxx.py | 1 + templates/adding_a_new_model/modeling_xxx.py | 1 + transformers/modeling_t5.py | 1 + 3 files changed, 3 insertions(+) diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py index d5f6c1e74..6478264ff 100644 --- a/templates/adding_a_new_model/modeling_tf_xxx.py +++ b/templates/adding_a_new_model/modeling_tf_xxx.py @@ -49,6 +49,7 @@ TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = { "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-tf_model.h5", } + #################################################### # TF 2.0 Models are constructed using Keras imperative API by sub-classing # - tf.keras.layers.Layer for the layers and diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py index e4e72c97f..5a5f76b3d 100644 --- a/templates/adding_a_new_model/modeling_xxx.py +++ b/templates/adding_a_new_model/modeling_xxx.py @@ -49,6 +49,7 @@ XXX_PRETRAINED_MODEL_ARCHIVE_MAP = { "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-pytorch_model.bin", } + #################################################### # This is a conversion method from TF 1.0 to PyTorch # More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index b67f01228..a1024d47d 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -49,6 +49,7 @@ T5_PRETRAINED_MODEL_ARCHIVE_MAP = { "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin", } + #################################################### # This is a conversion method from TF 1.0 to PyTorch # More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 -- GitLab From 7dce8dc7ac061f81a0ba2062cf586db52cd1ffd8 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 18:01:54 +0100 Subject: [PATCH 16/32] Fix E731 flake8 warning (x3). 
--- examples/summarization/run_summarization.py | 5 ++++- transformers/commands/serving.py | 5 ++++- transformers/modeling_utils.py | 1 - 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index 1917ca30b..4afa97b5a 100644 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -184,7 +184,10 @@ def save_rouge_scores(str_scores): def build_data_iterator(args, tokenizer): dataset = load_and_cache_examples(args, tokenizer) sampler = SequentialSampler(dataset) - collate_fn = lambda data: collate(data, tokenizer, block_size=512, device=args.device) + + def collate_fn(data): + return collate(data, tokenizer, block_size=512, device=args.device) + iterator = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,) return iterator diff --git a/transformers/commands/serving.py b/transformers/commands/serving.py index 5d48cc0b3..04dea67bf 100644 --- a/transformers/commands/serving.py +++ b/transformers/commands/serving.py @@ -15,7 +15,10 @@ try: _serve_dependancies_installed = True except (ImportError, AttributeError): BaseModel = object - Body = lambda *x, **y: None + + def Body(*x, **y): + pass + _serve_dependancies_installed = False diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 245a1afa0..79b7dafc1 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -77,7 +77,6 @@ class PreTrainedModel(nn.Module): """ config_class = None pretrained_model_archive_map = {} - load_tf_weights = lambda model, config, path: None base_model_prefix = "" @property -- GitLab From 5eab3cf6bce7b6f11793056d8772aeb6e761ac4f Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 18:03:57 +0100 Subject: [PATCH 17/32] Fix W605 flake8 warning (x5). --- examples/contrib/run_openai_gpt.py | 4 ++-- transformers/tokenization_xlm.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/contrib/run_openai_gpt.py b/examples/contrib/run_openai_gpt.py index e35f3d4fe..80331f340 100644 --- a/examples/contrib/run_openai_gpt.py +++ b/examples/contrib/run_openai_gpt.py @@ -22,8 +22,8 @@ --model_name openai-gpt \ --do_train \ --do_eval \ - --train_dataset $ROC_STORIES_DIR/cloze_test_val__spring2016\ -\ cloze_test_ALL_val.csv \ - --eval_dataset $ROC_STORIES_DIR/cloze_test_test__spring2016\ -\ cloze_test_ALL_test.csv \ + --train_dataset "$ROC_STORIES_DIR/cloze_test_val__spring2016 - cloze_test_ALL_val.csv" \ + --eval_dataset "$ROC_STORIES_DIR/cloze_test_test__spring2016 - cloze_test_ALL_test.csv" \ --output_dir ../log \ --train_batch_size 16 \ """ diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py index 7ef53cf80..465162931 100644 --- a/transformers/tokenization_xlm.py +++ b/transformers/tokenization_xlm.py @@ -725,10 +725,10 @@ class XLMTokenizer(PreTrainedTokenizer): make && make install pip install kytea ``` - - [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer * + - [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer (*) - Install with `pip install jieba` - \* The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip). + (*) The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip). However, the wrapper (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated. 
Jieba is a lot faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine if you fine-tune the model with Chinese supervisionself. If you want the same exact behaviour, use the original XLM -- GitLab From fd2f17a7a1197529474c24551f1f1d8f534168a3 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 18:07:03 +0100 Subject: [PATCH 18/32] Fix E714 flake8 warning (x8). --- examples/summarization/modeling_bertabs.py | 2 +- templates/adding_a_new_model/modeling_tf_xxx.py | 2 +- transformers/modeling_tf_albert.py | 2 +- transformers/modeling_tf_bert.py | 2 +- transformers/modeling_tf_gpt2.py | 2 +- transformers/modeling_tf_openai.py | 2 +- transformers/modeling_tf_t5.py | 2 +- transformers/modeling_tf_transfo_xl.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py index e8087f300..22e50b5e7 100644 --- a/examples/summarization/modeling_bertabs.py +++ b/examples/summarization/modeling_bertabs.py @@ -519,7 +519,7 @@ class MultiHeadedAttention(nn.Module): attn = self.softmax(scores) - if not predefined_graph_1 is None: + if predefined_graph_1 is not None: attn_masked = attn[:, -1] * predefined_graph_1 attn_masked = attn_masked / (torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9) diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py index 6478264ff..7b576a65d 100644 --- a/templates/adding_a_new_model/modeling_tf_xxx.py +++ b/templates/adding_a_new_model/modeling_tf_xxx.py @@ -152,7 +152,7 @@ class TFXxxMainLayer(tf.keras.layers.Layer): # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - if not head_mask is None: + if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers diff --git a/transformers/modeling_tf_albert.py b/transformers/modeling_tf_albert.py index e921dc8ca..7cc462143 100644 --- a/transformers/modeling_tf_albert.py +++ b/transformers/modeling_tf_albert.py @@ -686,7 +686,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel): # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - if not head_mask is None: + if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py index 4f919eab2..1360b1951 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -562,7 +562,7 @@ class TFBertMainLayer(tf.keras.layers.Layer): # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - if not head_mask is None: + if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py index 9ad049e9c..85f9773e0 100644 --- a/transformers/modeling_tf_gpt2.py +++ b/transformers/modeling_tf_gpt2.py @@ -311,7 +311,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): # 
attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - if not head_mask is None: + if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers diff --git a/transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py index 1fd2e961f..f7f98ecfd 100644 --- a/transformers/modeling_tf_openai.py +++ b/transformers/modeling_tf_openai.py @@ -303,7 +303,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - if not head_mask is None: + if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py index 5ab16ea43..84767eb13 100644 --- a/transformers/modeling_tf_t5.py +++ b/transformers/modeling_tf_t5.py @@ -456,7 +456,7 @@ class TFT5MainLayer(tf.keras.layers.Layer): # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - if not head_mask is None: + if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.num_hidden_layers diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py index b6807d33d..b5618df38 100644 --- a/transformers/modeling_tf_transfo_xl.py +++ b/transformers/modeling_tf_transfo_xl.py @@ -554,7 +554,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] - if not head_mask is None: + if head_mask is not None: raise NotImplementedError else: head_mask = [None] * self.n_layer -- GitLab From ea89bec185f7acaeb4b7e5c0ee1082e541becedd Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 18:13:15 +0100 Subject: [PATCH 19/32] Fix E231 flake8 warning (x9). 
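E231 is raised when a comma is not followed by whitespace. After the black reformatting, the remaining occurrences all come from a trailing comma squeezed against the closing brace of a one-line literal; removing the magic trailing comma keeps the literal on one line and silences the warning. Illustrative sketch (made-up dict, same shape as the hunks below):

    # flagged by E231: "," immediately followed by "}"
    headers = {"content-type": "text/plain",}

    # preferred
    headers = {"content-type": "text/plain"}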
--- transformers/hf_api.py | 2 +- transformers/optimization_tf.py | 4 +--- transformers/pipelines.py | 2 +- transformers/tests/model_card_test.py | 4 ++-- transformers/tokenization_ctrl.py | 4 ++-- transformers/tokenization_openai.py | 4 ++-- 6 files changed, 9 insertions(+), 11 deletions(-) diff --git a/transformers/hf_api.py b/transformers/hf_api.py index f92c10df5..7380a784c 100644 --- a/transformers/hf_api.py +++ b/transformers/hf_api.py @@ -130,7 +130,7 @@ class HfApi: pf = TqdmProgressFileReader(f) data = f if pf.total_size > 0 else "" - r = requests.put(urls.write, data=data, headers={"content-type": urls.type,}) + r = requests.put(urls.write, data=data, headers={"content-type": urls.type}) r.raise_for_status() pf.close() return urls.access diff --git a/transformers/optimization_tf.py b/transformers/optimization_tf.py index c2c8a3180..18c261f6c 100644 --- a/transformers/optimization_tf.py +++ b/transformers/optimization_tf.py @@ -158,9 +158,7 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): def get_config(self): config = super(AdamWeightDecay, self).get_config() - config.update( - {"weight_decay_rate": self.weight_decay_rate,} - ) + config.update({"weight_decay_rate": self.weight_decay_rate}) return config def _do_use_weight_decay(self, param_name): diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 3ddadc0cb..14cb4ac84 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -836,7 +836,7 @@ SUPPORTED_TASKS = { "tf": TFAutoModel if is_tf_available() else None, "pt": AutoModel if is_torch_available() else None, "default": { - "model": {"pt": "distilbert-base-uncased", "tf": "distilbert-base-uncased",}, + "model": {"pt": "distilbert-base-uncased", "tf": "distilbert-base-uncased"}, "config": None, "tokenizer": "distilbert-base-uncased", }, diff --git a/transformers/tests/model_card_test.py b/transformers/tests/model_card_test.py index 7a6f0721a..9b9947a72 100644 --- a/transformers/tests/model_card_test.py +++ b/transformers/tests/model_card_test.py @@ -34,14 +34,14 @@ class ModelCardTester(unittest.TestCase): }, "metrics": "BLEU and ROUGE-1", "evaluation_data": { - "Datasets": {"BLEU": "My-great-dataset-v1", "ROUGE-1": "My-short-dataset-v2.1",}, + "Datasets": {"BLEU": "My-great-dataset-v1", "ROUGE-1": "My-short-dataset-v2.1"}, "Preprocessing": "See details on https://arxiv.org/pdf/1810.03993.pdf", }, "training_data": { "Dataset": "English Wikipedia dump dated 2018-12-01", "Preprocessing": "Using SentencePiece vocabulary of size 52k tokens. 
See details on https://arxiv.org/pdf/1810.03993.pdf", }, - "quantitative_analyses": {"BLEU": 55.1, "ROUGE-1": 76,}, + "quantitative_analyses": {"BLEU": 55.1, "ROUGE-1": 76}, } def test_model_card_common_properties(self): diff --git a/transformers/tokenization_ctrl.py b/transformers/tokenization_ctrl.py index 5b401f91f..4c8d9c96b 100644 --- a/transformers/tokenization_ctrl.py +++ b/transformers/tokenization_ctrl.py @@ -33,8 +33,8 @@ VOCAB_FILES_NAMES = { } PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json",}, - "merges_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt",}, + "vocab_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json"}, + "merges_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt"}, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { diff --git a/transformers/tokenization_openai.py b/transformers/tokenization_openai.py index 4ea182c67..013d8cbc6 100644 --- a/transformers/tokenization_openai.py +++ b/transformers/tokenization_openai.py @@ -33,8 +33,8 @@ VOCAB_FILES_NAMES = { } PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",}, - "merges_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",}, + "vocab_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"}, + "merges_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"}, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { -- GitLab From b0f7db73cd2fd10142668d43fd30906a438f05f3 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 18:25:59 +0100 Subject: [PATCH 20/32] Fix E741 flake8 warning (x14). 
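E741 is raised for single-letter names that are easy to confuse with digits (l, I, O). The fix is a mechanical rename to something explicit, here mostly l -> scope_names (and attributes / match where that reads better). Sketch of the pattern, with a made-up checkpoint name for illustration:

    name = "bert/encoder/layer_0"

    # flagged by E741: "l" is easily read as "1"
    l = name.split("/")

    # preferred
    scope_names = name.split("/")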
--- examples/contrib/run_swag.py | 6 +++--- templates/adding_a_new_model/modeling_xxx.py | 18 ++++++++-------- transformers/modeling_albert.py | 18 ++++++++-------- transformers/modeling_bert.py | 18 ++++++++-------- transformers/modeling_gpt2.py | 18 ++++++++-------- transformers/modeling_openai.py | 16 +++++++------- transformers/modeling_t5.py | 22 ++++++++++---------- transformers/tokenization_t5.py | 4 ++-- 8 files changed, 60 insertions(+), 60 deletions(-) diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py index a996a2ac9..70dcca3b0 100644 --- a/examples/contrib/run_swag.py +++ b/examples/contrib/run_swag.py @@ -76,7 +76,7 @@ class SwagExample(object): return self.__repr__() def __repr__(self): - l = [ + attributes = [ "swag_id: {}".format(self.swag_id), "context_sentence: {}".format(self.context_sentence), "start_ending: {}".format(self.start_ending), @@ -87,9 +87,9 @@ class SwagExample(object): ] if self.label is not None: - l.append("label: {}".format(self.label)) + attributes.append("label: {}".format(self.label)) - return ", ".join(l) + return ", ".join(attributes) class InputFeatures(object): diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py index 5a5f76b3d..0779b6521 100644 --- a/templates/adding_a_new_model/modeling_xxx.py +++ b/templates/adding_a_new_model/modeling_xxx.py @@ -89,25 +89,25 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path): pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): - l = re.split(r"_(\d+)", m_name) + scope_names = re.split(r"_(\d+)", m_name) else: - l = [m_name] - if l[0] == "kernel" or l[0] == "gamma": + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") - elif l[0] == "output_bias" or l[0] == "beta": + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") - elif l[0] == "output_weights": + elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") - elif l[0] == "squad": + elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: try: - pointer = getattr(pointer, l[0]) + pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue - if len(l) >= 2: - num = int(l[1]) + if len(scope_names) >= 2: + num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": pointer = getattr(pointer, "weight") diff --git a/transformers/modeling_albert.py b/transformers/modeling_albert.py index 669d8fc35..5162a1d1d 100644 --- a/transformers/modeling_albert.py +++ b/transformers/modeling_albert.py @@ -124,26 +124,26 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path): pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): - l = re.split(r"_(\d+)", m_name) + scope_names = re.split(r"_(\d+)", m_name) else: - l = [m_name] + scope_names = [m_name] - if l[0] == "kernel" or l[0] == "gamma": + if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") - elif l[0] == "output_bias" or l[0] == "beta": + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") - elif l[0] == "output_weights": + elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") - elif l[0] == "squad": + elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: try: - pointer = 
getattr(pointer, l[0]) + pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue - if len(l) >= 2: - num = int(l[1]) + if len(scope_names) >= 2: + num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index 9c6cccf71..e2e4f2b5f 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -93,25 +93,25 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): - l = re.split(r"_(\d+)", m_name) + scope_names = re.split(r"_(\d+)", m_name) else: - l = [m_name] - if l[0] == "kernel" or l[0] == "gamma": + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": pointer = getattr(pointer, "weight") - elif l[0] == "output_bias" or l[0] == "beta": + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") - elif l[0] == "output_weights": + elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") - elif l[0] == "squad": + elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") else: try: - pointer = getattr(pointer, l[0]) + pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue - if len(l) >= 2: - num = int(l[1]) + if len(scope_names) >= 2: + num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": pointer = getattr(pointer, "weight") diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py index 1e5c54c95..ad086c451 100644 --- a/transformers/modeling_gpt2.py +++ b/transformers/modeling_gpt2.py @@ -77,20 +77,20 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+\d+", m_name): - l = re.split(r"(\d+)", m_name) + scope_names = re.split(r"(\d+)", m_name) else: - l = [m_name] - if l[0] == "w" or l[0] == "g": + scope_names = [m_name] + if scope_names[0] == "w" or scope_names[0] == "g": pointer = getattr(pointer, "weight") - elif l[0] == "b": + elif scope_names[0] == "b": pointer = getattr(pointer, "bias") - elif l[0] == "wpe" or l[0] == "wte": - pointer = getattr(pointer, l[0]) + elif scope_names[0] == "wpe" or scope_names[0] == "wte": + pointer = getattr(pointer, scope_names[0]) pointer = getattr(pointer, "weight") else: - pointer = getattr(pointer, l[0]) - if len(l) >= 2: - num = int(l[1]) + pointer = getattr(pointer, scope_names[0]) + if len(scope_names) >= 2: + num = int(scope_names[1]) pointer = pointer[num] try: assert pointer.shape == array.shape diff --git a/transformers/modeling_openai.py b/transformers/modeling_openai.py index 3f37a4acf..66487755c 100644 --- a/transformers/modeling_openai.py +++ b/transformers/modeling_openai.py @@ -90,19 +90,19 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): pointer = model for m_name in name: if re.fullmatch(r"[A-Za-z]+\d+", m_name): - l = re.split(r"(\d+)", m_name) + scope_names = re.split(r"(\d+)", m_name) else: - l = [m_name] - if l[0] == "g": + scope_names = [m_name] + if scope_names[0] == "g": pointer = getattr(pointer, "weight") - elif l[0] == "b": + elif scope_names[0] == "b": pointer = getattr(pointer, "bias") - elif l[0] == "w": + elif scope_names[0] == "w": pointer = getattr(pointer, "weight") else: - pointer = getattr(pointer, l[0]) - if 
len(l) >= 2: - num = int(l[1]) + pointer = getattr(pointer, scope_names[0]) + if len(scope_names) >= 2: + num = int(scope_names[1]) pointer = pointer[num] try: assert pointer.shape == array.shape diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index a1024d47d..199ec7422 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -95,29 +95,29 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): array = tf_weights[txt_name] for m_name in name: if re.fullmatch(r"[A-Za-z]+_\d+", m_name): - l = re.split(r"_(\d+)", m_name) + scope_names = re.split(r"_(\d+)", m_name) else: - l = [m_name] - if l[0] in ["kernel", "scale", "embedding"]: + scope_names = [m_name] + if scope_names[0] in ["kernel", "scale", "embedding"]: pointer = getattr(pointer, "weight") - # elif l[0] == 'scale': + # elif scope_names[0] == 'scale': # pointer = getattr(pointer, 'weight') - # elif l[0] == 'output_bias' or l[0] == 'beta': + # elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta': # pointer = getattr(pointer, 'bias') - # elif l[0] == 'squad': + # elif scope_names[0] == 'squad': # pointer = getattr(pointer, 'classifier') else: try: - pointer = getattr(pointer, l[0]) + pointer = getattr(pointer, scope_names[0]) except AttributeError: logger.info("Skipping {}".format("/".join(name))) continue - if len(l) >= 2: - num = int(l[1]) + if len(scope_names) >= 2: + num = int(scope_names[1]) pointer = pointer[num] - if l[0] not in ["kernel", "scale", "embedding"]: + if scope_names[0] not in ["kernel", "scale", "embedding"]: pointer = getattr(pointer, "weight") - if l[0] != "embedding": + if scope_names[0] != "embedding": logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name)) array = np.transpose(array) try: diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py index 8eb589cd1..e9921fef8 100644 --- a/transformers/tokenization_t5.py +++ b/transformers/tokenization_t5.py @@ -160,8 +160,8 @@ class T5Tokenizer(PreTrainedTokenizer): def _convert_token_to_id(self, token): """ Converts a token (str/unicode) in an id using the vocab. """ if token.startswith("", token) - num = int(l.group(1)) + match = re.match(r"", token) + num = int(match.group(1)) return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) -- GitLab From 631be27078fe394fdd8f98b9475ca87026f8044d Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 20:22:05 +0100 Subject: [PATCH 21/32] Fix E722 flake8 warnings (x26). 
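E722 is raised for bare except: clauses, which also swallow KeyboardInterrupt and SystemExit. Each site below now names the narrowest exception the code actually expects: ImportError for the optional tensorboard dependency, FileNotFoundError for a missing token file, ValueError for a failed index() lookup in the BPE merge loop, and Exception where the intent really is "anything went wrong". The tensorboard fallback, as it appears in the example scripts:

    try:
        from torch.utils.tensorboard import SummaryWriter
    except ImportError:
        from tensorboardX import SummaryWriter

In the BPE tokenizers, the success path of the word.index(first, i) lookup additionally moves into an else: block so that only the index() call itself is guarded.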
--- examples/contrib/run_swag.py | 2 +- examples/distillation/distiller.py | 2 +- examples/distillation/run_squad_w_distillation.py | 2 +- examples/mm-imdb/run_mmimdb.py | 2 +- examples/pplm/run_pplm.py | 4 ++-- examples/pplm/run_pplm_discrim_train.py | 8 ++++---- examples/run_glue.py | 2 +- examples/run_lm_finetuning.py | 2 +- examples/run_multiple_choice.py | 2 +- examples/run_squad.py | 2 +- examples/run_xnli.py | 2 +- templates/adding_a_new_example_script/run_xxx.py | 2 +- transformers/__init__.py | 6 +++--- transformers/hf_api.py | 10 ++++------ transformers/modeling_utils.py | 2 +- transformers/tests/modeling_tf_common_test.py | 6 +++--- transformers/tokenization_ctrl.py | 7 ++++--- transformers/tokenization_gpt2.py | 7 ++++--- transformers/tokenization_openai.py | 7 ++++--- transformers/tokenization_xlm.py | 7 ++++--- 20 files changed, 43 insertions(+), 41 deletions(-) diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py index 70dcca3b0..bfa1cd166 100644 --- a/examples/contrib/run_swag.py +++ b/examples/contrib/run_swag.py @@ -44,7 +44,7 @@ from transformers import ( try: from torch.utils.tensorboard import SummaryWriter -except: +except ImportError: from tensorboardX import SummaryWriter diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index a957b1a09..16d73ece3 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -37,7 +37,7 @@ from utils import logger try: from torch.utils.tensorboard import SummaryWriter -except: +except ImportError: from tensorboardX import SummaryWriter diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py index 11524e388..ca7341968 100644 --- a/examples/distillation/run_squad_w_distillation.py +++ b/examples/distillation/run_squad_w_distillation.py @@ -67,7 +67,7 @@ from ..utils_squad_evaluate import main as evaluate_on_squad try: from torch.utils.tensorboard import SummaryWriter -except: +except ImportError: from tensorboardX import SummaryWriter diff --git a/examples/mm-imdb/run_mmimdb.py b/examples/mm-imdb/run_mmimdb.py index e87555f7d..24ad82190 100644 --- a/examples/mm-imdb/run_mmimdb.py +++ b/examples/mm-imdb/run_mmimdb.py @@ -62,7 +62,7 @@ from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_image_trans try: from torch.utils.tensorboard import SummaryWriter -except: +except ImportError: from tensorboardX import SummaryWriter diff --git a/examples/pplm/run_pplm.py b/examples/pplm/run_pplm.py index ec848323e..8c405b56a 100644 --- a/examples/pplm/run_pplm.py +++ b/examples/pplm/run_pplm.py @@ -697,8 +697,8 @@ def run_pplm_example( print("= Perturbed generated text {} =".format(i + 1)) print(pert_gen_text) print() - except: - pass + except Exception as exc: + print("Ignoring error while generating perturbed text:", exc) # keep the prefix, perturbed seq, original seq for each index generated_texts.append((tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text)) diff --git a/examples/pplm/run_pplm_discrim_train.py b/examples/pplm/run_pplm_discrim_train.py index 287715e53..1c21c56c8 100644 --- a/examples/pplm/run_pplm_discrim_train.py +++ b/examples/pplm/run_pplm_discrim_train.py @@ -285,7 +285,7 @@ def train_discriminator( for i, line in enumerate(f): try: data.append(eval(line)) - except: + except Exception: print("Error evaluating line {}: {}".format(i, line)) continue x = [] @@ -303,7 +303,7 @@ def train_discriminator( continue x.append(seq) y.append(d["label"]) - except: + except 
Exception: print("Error evaluating / tokenizing" " line {}, skipping it".format(i)) pass @@ -343,7 +343,7 @@ def train_discriminator( continue x.append(seq) y.append(int(np.sum(d["label"]) > 0)) - except: + except Exception: print("Error evaluating / tokenizing" " line {}, skipping it".format(i)) pass @@ -402,7 +402,7 @@ def train_discriminator( x.append(seq) y.append(class2idx[label]) - except: + except Exception: print("Error tokenizing line {}, skipping it".format(i)) pass diff --git a/examples/run_glue.py b/examples/run_glue.py index d70e20f33..f98190059 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -64,7 +64,7 @@ from transformers import glue_processors as processors try: from torch.utils.tensorboard import SummaryWriter -except: +except ImportError: from tensorboardX import SummaryWriter diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index f916897d0..ae3d68dad 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -63,7 +63,7 @@ from transformers import ( try: from torch.utils.tensorboard import SummaryWriter -except: +except ImportError: from tensorboardX import SummaryWriter diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index 27b2e51e6..82f5a7ee4 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -48,7 +48,7 @@ from utils_multiple_choice import convert_examples_to_features, processors try: from torch.utils.tensorboard import SummaryWriter -except: +except ImportError: from tensorboardX import SummaryWriter diff --git a/examples/run_squad.py b/examples/run_squad.py index 9dbc39cb7..acf4d0eec 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -64,7 +64,7 @@ from transformers.data.processors.squad import SquadResult, SquadV1Processor, Sq try: from torch.utils.tensorboard import SummaryWriter -except: +except ImportError: from tensorboardX import SummaryWriter diff --git a/examples/run_xnli.py b/examples/run_xnli.py index bc1789f1d..f772bb5cb 100644 --- a/examples/run_xnli.py +++ b/examples/run_xnli.py @@ -52,7 +52,7 @@ from transformers import xnli_processors as processors try: from torch.utils.tensorboard import SummaryWriter -except: +except ImportError: from tensorboardX import SummaryWriter diff --git a/templates/adding_a_new_example_script/run_xxx.py b/templates/adding_a_new_example_script/run_xxx.py index e7e95ede6..eec29b59b 100644 --- a/templates/adding_a_new_example_script/run_xxx.py +++ b/templates/adding_a_new_example_script/run_xxx.py @@ -63,7 +63,7 @@ from utils_squad_evaluate import main as evaluate_on_squad try: from torch.utils.tensorboard import SummaryWriter -except: +except ImportError: from tensorboardX import SummaryWriter diff --git a/transformers/__init__.py b/transformers/__init__.py index 8e52771f2..f5f961a9e 100755 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -6,12 +6,12 @@ __version__ = "2.3.0" # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 try: import absl.logging - +except ImportError: + pass +else: absl.logging.set_verbosity("info") absl.logging.set_stderrthreshold("info") absl.logging._warn_preinit_stderr = False -except: - pass import logging diff --git a/transformers/hf_api.py b/transformers/hf_api.py index 7380a784c..b1c765083 100644 --- a/transformers/hf_api.py +++ b/transformers/hf_api.py @@ -205,10 +205,8 @@ class HfFolder: try: with open(cls.path_token, "r") as f: return f.read() - except: - # this is too wide. 
When Py2 is dead use: - # `except FileNotFoundError:` instead - return None + except FileNotFoundError: + pass @classmethod def delete_token(cls): @@ -218,5 +216,5 @@ class HfFolder: """ try: os.remove(cls.path_token) - except: - return + except FileNotFoundError: + pass diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 79b7dafc1..d09a4880b 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -439,7 +439,7 @@ class PreTrainedModel(nn.Module): if state_dict is None and not from_tf: try: state_dict = torch.load(resolved_archive_file, map_location="cpu") - except: + except Exception: raise OSError( "Unable to load weights from pytorch checkpoint file. " "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. " diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py index 6f2d62cc9..ac43e7e87 100644 --- a/transformers/tests/modeling_tf_common_test.py +++ b/transformers/tests/modeling_tf_common_test.py @@ -333,13 +333,13 @@ class TFCommonTestCases: # We used to fall back to just synthetically creating a dummy tensor of ones: try: x = wte(input_ids, mode="embedding") - except: + except Exception: try: x = wte([input_ids], mode="embedding") - except: + except Exception: try: x = wte([input_ids, None, None, None], mode="embedding") - except: + except Exception: if hasattr(self.model_tester, "embedding_size"): x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32) else: diff --git a/transformers/tokenization_ctrl.py b/transformers/tokenization_ctrl.py index 4c8d9c96b..24036b422 100644 --- a/transformers/tokenization_ctrl.py +++ b/transformers/tokenization_ctrl.py @@ -168,11 +168,12 @@ class CTRLTokenizer(PreTrainedTokenizer): while i < len(word): try: j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: + except ValueError: new_word.extend(word[i:]) break + else: + new_word.extend(word[i:j]) + i = j if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) diff --git a/transformers/tokenization_gpt2.py b/transformers/tokenization_gpt2.py index 06da88850..c8f97f052 100644 --- a/transformers/tokenization_gpt2.py +++ b/transformers/tokenization_gpt2.py @@ -178,11 +178,12 @@ class GPT2Tokenizer(PreTrainedTokenizer): while i < len(word): try: j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: + except ValueError: new_word.extend(word[i:]) break + else: + new_word.extend(word[i:j]) + i = j if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) diff --git a/transformers/tokenization_openai.py b/transformers/tokenization_openai.py index 013d8cbc6..9c4c48548 100644 --- a/transformers/tokenization_openai.py +++ b/transformers/tokenization_openai.py @@ -136,11 +136,12 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): while i < len(word): try: j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: + except ValueError: new_word.extend(word[i:]) break + else: + new_word.extend(word[i:j]) + i = j if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py index 465162931..9d315f880 100644 --- a/transformers/tokenization_xlm.py +++ b/transformers/tokenization_xlm.py @@ -683,11 +683,12 @@ class XLMTokenizer(PreTrainedTokenizer): while i < len(word): try: j = 
word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: + except ValueError: new_word.extend(word[i:]) break + else: + new_word.extend(word[i:j]) + i = j if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) -- GitLab From 2ab78325f079781009d5300cc85ce7f6af6c2576 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 20:53:17 +0100 Subject: [PATCH 22/32] Fix F821 flake8 warning (x47). Ignore warnings related to Python 2, because it's going away soon. --- examples/contrib/run_swag.py | 2 +- examples/run_generation.py | 2 +- examples/utils_multiple_choice.py | 2 +- .../adding_a_new_model/modeling_tf_xxx.py | 11 +++++++++++ templates/adding_a_new_model/modeling_xxx.py | 18 ++++++++++++++++++ transformers/commands/user.py | 4 ++-- transformers/data/processors/utils.py | 2 +- transformers/file_utils.py | 2 +- transformers/hf_api.py | 11 +++++------ transformers/modeling_bert.py | 8 ++++++-- transformers/modeling_tf_albert.py | 8 ++++++-- transformers/modeling_tf_auto.py | 4 ++-- transformers/modeling_tf_bert.py | 8 ++++++-- transformers/modeling_tf_xlnet.py | 4 ++-- transformers/modeling_xlnet.py | 2 +- transformers/tests/tokenization_utils_test.py | 2 +- transformers/tokenization_albert.py | 2 +- transformers/tokenization_gpt2.py | 2 +- transformers/tokenization_transfo_xl.py | 8 ++++---- transformers/tokenization_utils.py | 12 ++++++------ transformers/tokenization_xlnet.py | 2 +- 21 files changed, 78 insertions(+), 38 deletions(-) diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py index bfa1cd166..7d1a9e8e8 100644 --- a/examples/contrib/run_swag.py +++ b/examples/contrib/run_swag.py @@ -108,7 +108,7 @@ def read_swag_examples(input_file, is_training=True): lines = [] for line in reader: if sys.version_info[0] == 2: - line = list(unicode(cell, "utf-8") for cell in line) + line = list(unicode(cell, "utf-8") for cell in line) # noqa: F821 lines.append(line) if is_training and lines[0][-1] != "label": diff --git a/examples/run_generation.py b/examples/run_generation.py index 629b9348a..531c48532 100644 --- a/examples/run_generation.py +++ b/examples/run_generation.py @@ -225,7 +225,7 @@ def main(): # Batch size == 1. 
to add more examples please use num_return_sequences > 1 generated_sequence = output_sequences[0].tolist() text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) - text = text[: t.find(args.stop_token) if args.stop_token else None] + text = text[: text.find(args.stop_token) if args.stop_token else None] print(text) diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py index 987ffbc0e..1eea8f335 100644 --- a/examples/utils_multiple_choice.py +++ b/examples/utils_multiple_choice.py @@ -184,7 +184,7 @@ class SwagProcessor(DataProcessor): lines = [] for line in reader: if sys.version_info[0] == 2: - line = list(unicode(cell, "utf-8") for cell in line) + line = list(unicode(cell, "utf-8") for cell in line) # noqa: F821 lines.append(line) return lines diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py index 7b576a65d..edebd8ab0 100644 --- a/templates/adding_a_new_model/modeling_tf_xxx.py +++ b/templates/adding_a_new_model/modeling_tf_xxx.py @@ -68,6 +68,14 @@ TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = { # # See the conversion methods in modeling_tf_pytorch_utils.py for more details #################################################### + +TFXxxAttention = tf.keras.layers.Layer + +TFXxxIntermediate = tf.keras.layers.Layer + +TFXxxOutput = tf.keras.layers.Layer + + class TFXxxLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFXxxLayer, self).__init__(**kwargs) @@ -316,6 +324,9 @@ class TFXxxModel(TFXxxPreTrainedModel): return outputs +TFXxxMLMHead = tf.keras.layers.Layer + + @add_start_docstrings( """Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING ) diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py index 0779b6521..c4bcc55fd 100644 --- a/templates/adding_a_new_model/modeling_xxx.py +++ b/templates/adding_a_new_model/modeling_xxx.py @@ -135,6 +135,14 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path): # # See the conversion methods in modeling_tf_pytorch_utils.py for more details #################################################### + +XxxAttention = nn.Module + +XxxIntermediate = nn.Module + +XxxOutput = nn.Module + + class XxxLayer(nn.Module): def __init__(self, config): super(XxxLayer, self).__init__() @@ -160,6 +168,16 @@ class XxxLayer(nn.Module): # pointers for your model and the weights initialization # method if its not fully covered by PreTrainedModel's default method #################################################### + +XxxLayerNorm = torch.nn.LayerNorm + +XxxEmbeddings = nn.Module + +XxxEncoder = nn.Module + +XxxPooler = nn.Module + + class XxxPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. 
diff --git a/transformers/commands/user.py b/transformers/commands/user.py index 6800920cf..65761ae98 100644 --- a/transformers/commands/user.py +++ b/transformers/commands/user.py @@ -1,6 +1,7 @@ import os from argparse import ArgumentParser from getpass import getpass +from typing import List, Union from transformers.commands import BaseTransformersCLICommand from transformers.hf_api import HfApi, HfFolder, HTTPError @@ -96,8 +97,7 @@ class LogoutCommand(BaseUserCommand): class ListObjsCommand(BaseUserCommand): - def tabulate(self, rows, headers): - # type: (List[List[Union[str, int]]], List[str]) -> str + def tabulate(self, rows: List[List[Union[str, int]]], headers: List[str]) -> str: """ Inspired by: stackoverflow.com/a/8356620/593036 diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py index 7e044438a..0ac98bf0f 100644 --- a/transformers/data/processors/utils.py +++ b/transformers/data/processors/utils.py @@ -102,7 +102,7 @@ class DataProcessor(object): lines = [] for line in reader: if sys.version_info[0] == 2: - line = list(unicode(cell, "utf-8") for cell in line) + line = list(unicode(cell, "utf-8") for cell in line) # noqa: F821 lines.append(line) return lines diff --git a/transformers/file_utils.py b/transformers/file_utils.py index b1a4d240d..2334ff06b 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -419,7 +419,7 @@ def get_from_cache( with open(meta_path, "w") as meta_file: output_string = json.dumps(meta) if sys.version_info[0] == 2 and isinstance(output_string, str): - output_string = unicode(output_string, "utf-8") # The beauty of python 2 + output_string = unicode(output_string, "utf-8") # noqa: F821 meta_file.write(output_string) return cache_path diff --git a/transformers/hf_api.py b/transformers/hf_api.py index b1c765083..13469728d 100644 --- a/transformers/hf_api.py +++ b/transformers/hf_api.py @@ -14,8 +14,10 @@ # limitations under the License. from __future__ import absolute_import, division, print_function +import io import os from os.path import expanduser +from typing import List import requests import six @@ -93,7 +95,7 @@ class HfApi: return d["user"] def logout(self, token): - # type: (...) -> void + # type: (...) -> None """ Call HF API to log out. """ @@ -135,8 +137,7 @@ class HfApi: pf.close() return urls.access - def list_objs(self, token): - # type: (...) -> List[S3Obj] + def list_objs(self, token) -> List[S3Obj]: """ Call HF API to list all stored files for user. """ @@ -156,9 +157,7 @@ class TqdmProgressFileReader: for implementation details. 
""" - def __init__( - self, f # type: io.BufferedReader - ): + def __init__(self, f: io.BufferedReader): self.f = f self.total_size = os.fstat(f.fileno()).st_size # type: int self.pbar = tqdm(total=self.total_size, leave=False) diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index e2e4f2b5f..9b56bc45d 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -339,7 +339,9 @@ class BertIntermediate(nn.Module): def __init__(self, config): super(BertIntermediate, self).__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + if isinstance(config.hidden_act, str) or ( + sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821 + ): self.intermediate_act_fn = ACT2FN[config.hidden_act] else: self.intermediate_act_fn = config.hidden_act @@ -459,7 +461,9 @@ class BertPredictionHeadTransform(nn.Module): def __init__(self, config): super(BertPredictionHeadTransform, self).__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + if isinstance(config.hidden_act, str) or ( + sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821 + ): self.transform_act_fn = ACT2FN[config.hidden_act] else: self.transform_act_fn = config.hidden_act diff --git a/transformers/modeling_tf_albert.py b/transformers/modeling_tf_albert.py index 7cc462143..ab9d14ab1 100644 --- a/transformers/modeling_tf_albert.py +++ b/transformers/modeling_tf_albert.py @@ -311,7 +311,9 @@ class TFAlbertLayer(tf.keras.layers.Layer): config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn" ) - if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + if isinstance(config.hidden_act, str) or ( + sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821 + ): self.activation = ACT2FN[config.hidden_act] else: self.activation = config.hidden_act @@ -452,7 +454,9 @@ class TFAlbertMLMHead(tf.keras.layers.Layer): self.dense = tf.keras.layers.Dense( config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + if isinstance(config.hidden_act, str) or ( + sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821 + ): self.activation = ACT2FN[config.hidden_act] else: self.activation = config.hidden_act diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py index 9ce83fe4d..bd41893b6 100644 --- a/transformers/modeling_tf_auto.py +++ b/transformers/modeling_tf_auto.py @@ -690,9 +690,9 @@ class TFAutoModelForQuestionAnswering(object): elif isinstance(config, BertConfig): return TFBertForQuestionAnswering(config) elif isinstance(config, XLNetConfig): - return TFXLNetForQuestionAnswering(config) + raise NotImplementedError("TFXLNetForQuestionAnswering isn't implemented") elif isinstance(config, XLMConfig): - return TFXLMForQuestionAnswering(config) + raise NotImplementedError("TFXLMForQuestionAnswering isn't implemented") raise ValueError("Unrecognized configuration class {}".format(config)) @classmethod diff --git a/transformers/modeling_tf_bert.py 
b/transformers/modeling_tf_bert.py index 1360b1951..f67901618 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -315,7 +315,9 @@ class TFBertIntermediate(tf.keras.layers.Layer): self.dense = tf.keras.layers.Dense( config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + if isinstance(config.hidden_act, str) or ( + sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821 + ): self.intermediate_act_fn = ACT2FN[config.hidden_act] else: self.intermediate_act_fn = config.hidden_act @@ -420,7 +422,9 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer): self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + if isinstance(config.hidden_act, str) or ( + sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821 + ): self.transform_act_fn = ACT2FN[config.hidden_act] else: self.transform_act_fn = config.hidden_act diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py index e913a0513..9e48856a6 100644 --- a/transformers/modeling_tf_xlnet.py +++ b/transformers/modeling_tf_xlnet.py @@ -295,7 +295,7 @@ class TFXLNetFeedForward(tf.keras.layers.Layer): ) self.dropout = tf.keras.layers.Dropout(config.dropout) if isinstance(config.ff_activation, str) or ( - sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode) + sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode) # noqa: F821 ): self.activation_function = ACT2FN[config.ff_activation] else: @@ -483,7 +483,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): if dtype is not None and dtype != tf.float32: fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype) if self.clamp_len > 0: - fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -clamp_len, clamp_len) + fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len) pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz) return pos_emb diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py index 423ba8cb7..d749f1d12 100644 --- a/transformers/modeling_xlnet.py +++ b/transformers/modeling_xlnet.py @@ -431,7 +431,7 @@ class XLNetFeedForward(nn.Module): self.layer_2 = nn.Linear(config.d_inner, config.d_model) self.dropout = nn.Dropout(config.dropout) if isinstance(config.ff_activation, str) or ( - sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode) + sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode) # noqa: F821 ): self.activation_function = ACT2FN[config.ff_activation] else: diff --git a/transformers/tests/tokenization_utils_test.py b/transformers/tests/tokenization_utils_test.py index 76681b1af..886511066 100644 --- a/transformers/tests/tokenization_utils_test.py +++ b/transformers/tests/tokenization_utils_test.py @@ -35,7 +35,7 @@ class TokenizerUtilsTest(unittest.TestCase): for special_tok in tokenizer.all_special_tokens: if six.PY2: - self.assertIsInstance(special_tok, unicode) + self.assertIsInstance(special_tok, unicode) # noqa: F821 else: self.assertIsInstance(special_tok, str) special_tok_id = tokenizer.convert_tokens_to_ids(special_tok) diff --git a/transformers/tokenization_albert.py b/transformers/tokenization_albert.py index 
276a33cbf..699304bb5 100644 --- a/transformers/tokenization_albert.py +++ b/transformers/tokenization_albert.py @@ -156,7 +156,7 @@ class AlbertTokenizer(PreTrainedTokenizer): """ text = self.preprocess_text(text) # note(zhiliny): in some systems, sentencepiece only accepts str for py2 - if six.PY2 and isinstance(text, unicode): + if six.PY2 and isinstance(text, unicode): # noqa: F821 text = text.encode("utf-8") if not sample: diff --git a/transformers/tokenization_gpt2.py b/transformers/tokenization_gpt2.py index c8f97f052..6b2b85093 100644 --- a/transformers/tokenization_gpt2.py +++ b/transformers/tokenization_gpt2.py @@ -80,7 +80,7 @@ def bytes_to_unicode(): This is a signficant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. """ - _chr = unichr if sys.version_info[0] == 2 else chr + _chr = unichr if sys.version_info[0] == 2 else chr # noqa: F821 bs = ( list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) ) diff --git a/transformers/tokenization_transfo_xl.py b/transformers/tokenization_transfo_xl.py index ce058580b..9f5dc63f6 100644 --- a/transformers/tokenization_transfo_xl.py +++ b/transformers/tokenization_transfo_xl.py @@ -36,10 +36,10 @@ try: except ImportError: pass -# if sys.version_info[0] == 2: -# import cPickle as pickle -# else: -# import pickle +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle logger = logging.getLogger(__name__) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index f0df6de60..8c60beb9d 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -252,10 +252,10 @@ class PreTrainedTokenizer(object): if key in self.SPECIAL_TOKENS_ATTRIBUTES: if key == "additional_special_tokens": assert isinstance(value, (list, tuple)) and all( - isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value + isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value # noqa: F821 ) else: - assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) + assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) # noqa: F821 setattr(self, key, value) @classmethod @@ -567,7 +567,7 @@ class PreTrainedTokenizer(object): to_add_tokens = [] for token in new_tokens: - assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode)) + assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode)) # noqa: F821 if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens: token = token.lower() if ( @@ -650,11 +650,11 @@ class PreTrainedTokenizer(object): assert key in self.SPECIAL_TOKENS_ATTRIBUTES if key == "additional_special_tokens": assert isinstance(value, (list, tuple)) and all( - isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value + isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value # noqa: F821 ) added_tokens += self.add_tokens(value) else: - assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) + assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) # noqa: F821 added_tokens += self.add_tokens([value]) logger.info("Assigning %s to the %s key of the tokenizer", value, key) setattr(self, key, value) @@ -746,7 +746,7 @@ class PreTrainedTokenizer(object): if tokens is None: return None - if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)): + if 
isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)): # noqa: F821 return self._convert_token_to_id_with_added_voc(tokens) ids = [] diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py index 6c016728e..ac41afb80 100644 --- a/transformers/tokenization_xlnet.py +++ b/transformers/tokenization_xlnet.py @@ -156,7 +156,7 @@ class XLNetTokenizer(PreTrainedTokenizer): """ text = self.preprocess_text(text) # note(zhiliny): in some systems, sentencepiece only accepts str for py2 - if six.PY2 and isinstance(text, unicode): + if six.PY2 and isinstance(text, unicode): # noqa: F821 text = text.encode("utf-8") if not sample: -- GitLab From fa2ccbc0817d6c0848555a8f44d475f13f49e26f Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 21:22:55 +0100 Subject: [PATCH 23/32] Fix E266 flake8 warning (x90). --- examples/contrib/run_swag.py | 4 ++-- .../distillation/run_squad_w_distillation.py | 4 ++-- examples/distillation/scripts/extract.py | 6 +++--- examples/distillation/train.py | 18 ++++++++--------- examples/mm-imdb/run_mmimdb.py | 4 ++-- examples/run_bertology.py | 4 ++-- examples/run_glue.py | 4 ++-- examples/run_lm_finetuning.py | 4 ++-- examples/run_multiple_choice.py | 4 ++-- examples/run_ner.py | 4 ++-- examples/run_xnli.py | 4 ++-- .../adding_a_new_example_script/run_xxx.py | 4 ++-- ...t_xxx_original_tf_checkpoint_to_pytorch.py | 2 +- ...lbert_original_tf_checkpoint_to_pytorch.py | 2 +- ..._bert_original_tf_checkpoint_to_pytorch.py | 2 +- ..._gpt2_original_tf_checkpoint_to_pytorch.py | 2 +- ...penai_original_tf_checkpoint_to_pytorch.py | 2 +- .../convert_pytorch_checkpoint_to_tf2.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 12 +++++------ ...rt_t5_original_tf_checkpoint_to_pytorch.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 2 +- ...xlnet_original_tf_checkpoint_to_pytorch.py | 2 +- transformers/modeling_distilbert.py | 4 ++-- transformers/modeling_tf_distilbert.py | 4 ++-- transformers/modeling_tf_pytorch_utils.py | 6 ++++-- transformers/modeling_tf_transfo_xl.py | 20 +++++++++---------- transformers/modeling_tf_xlnet.py | 16 +++++++-------- transformers/modeling_transfo_xl.py | 20 +++++++++---------- transformers/modeling_xlnet.py | 16 +++++++-------- transformers/optimization_tf.py | 2 +- 30 files changed, 92 insertions(+), 90 deletions(-) diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py index 7d1a9e8e8..bc6ff1497 100644 --- a/examples/contrib/run_swag.py +++ b/examples/contrib/run_swag.py @@ -487,7 +487,7 @@ def evaluate(args, model, tokenizer, prefix=""): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--train_file", default=None, type=str, required=True, help="SWAG csv for training. 
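E266 is raised when a block comment starts with more than one "#". The fix is to drop the extra hashes, e.g.:

    # before (flagged by E266):
    ## Required parameters

    # after:
    # Required parameters

The "### Section ###" banners in the distillation scripts become plain "# Section #" comments in the same way.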
E.g., train.csv" ) @@ -520,7 +520,7 @@ def main(): help="The output directory where the model checkpoints and predictions will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py index ca7341968..c046730c1 100644 --- a/examples/distillation/run_squad_w_distillation.py +++ b/examples/distillation/run_squad_w_distillation.py @@ -430,7 +430,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--train_file", default=None, type=str, required=True, help="SQuAD json for training. E.g., train-v1.1.json" ) @@ -486,7 +486,7 @@ def main(): "--temperature", default=2.0, type=float, help="Distillation temperature. Only for distillation." ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) diff --git a/examples/distillation/scripts/extract.py b/examples/distillation/scripts/extract.py index 429350a77..f91b6d321 100644 --- a/examples/distillation/scripts/extract.py +++ b/examples/distillation/scripts/extract.py @@ -43,7 +43,7 @@ if __name__ == "__main__": state_dict = model.state_dict() compressed_sd = {} - ### Embeddings ### + # Embeddings # if args.model_type == "gpt2": for param_name in ["wte.weight", "wpe.weight"]: compressed_sd[f"{prefix}.{param_name}"] = state_dict[f"{prefix}.{param_name}"] @@ -55,7 +55,7 @@ if __name__ == "__main__": param_name = f"{prefix}.embeddings.LayerNorm.{w}" compressed_sd[param_name] = state_dict[param_name] - ### Transformer Blocks ### + # Transformer Blocks # std_idx = 0 for teacher_idx in [0, 2, 4, 7, 9, 11]: if args.model_type == "gpt2": @@ -82,7 +82,7 @@ if __name__ == "__main__": ] std_idx += 1 - ### Language Modeling Head ###s + # Language Modeling Head ###s if args.model_type == "roberta": for layer in ["lm_head.decoder.weight", "lm_head.bias"]: compressed_sd[f"{layer}"] = state_dict[f"{layer}"] diff --git a/examples/distillation/train.py b/examples/distillation/train.py index a37a7c427..670d03ea1 100644 --- a/examples/distillation/train.py +++ b/examples/distillation/train.py @@ -219,7 +219,7 @@ def main(): args = parser.parse_args() sanity_checks(args) - ## ARGS ## + # ARGS # init_gpu_params(args) set_seed(args) if args.is_master: @@ -236,7 +236,7 @@ def main(): os.makedirs(args.dump_path) logger.info(f"Experiment will be dumped and logged in {args.dump_path}") - ### SAVE PARAMS ### + # SAVE PARAMS # logger.info(f"Param: {args}") with open(os.path.join(args.dump_path, "parameters.json"), "w") as f: json.dump(vars(args), f, indent=4) @@ -245,7 +245,7 @@ def main(): student_config_class, student_model_class, _ = MODEL_CLASSES[args.student_type] teacher_config_class, teacher_model_class, teacher_tokenizer_class = MODEL_CLASSES[args.teacher_type] - ### TOKENIZER ### + # TOKENIZER # tokenizer = teacher_tokenizer_class.from_pretrained(args.teacher_name) special_tok_ids = {} for tok_name, tok_symbol in tokenizer.special_tokens_map.items(): @@ -255,7 +255,7 @@ def main(): args.special_tok_ids = special_tok_ids args.max_model_input_size = tokenizer.max_model_input_sizes[args.teacher_name] - ## DATA LOADER ## + # DATA LOADER # logger.info(f"Loading data 
from {args.data_file}") with open(args.data_file, "rb") as fp: data = pickle.load(fp) @@ -275,7 +275,7 @@ def main(): train_lm_seq_dataset = LmSeqsDataset(params=args, data=data) logger.info(f"Data loader created.") - ## STUDENT ## + # STUDENT # logger.info(f"Loading student config from {args.student_config}") stu_architecture_config = student_config_class.from_pretrained(args.student_config) stu_architecture_config.output_hidden_states = True @@ -290,26 +290,26 @@ def main(): student.to(f"cuda:{args.local_rank}") logger.info(f"Student loaded.") - ## TEACHER ## + # TEACHER # teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True) if args.n_gpu > 0: teacher.to(f"cuda:{args.local_rank}") logger.info(f"Teacher loaded from {args.teacher_name}.") - ## FREEZING ## + # FREEZING # if args.freeze_pos_embs: freeze_pos_embeddings(student, args) if args.freeze_token_type_embds: freeze_token_type_embeddings(student, args) - ## SANITY CHECKS ## + # SANITY CHECKS # assert student.config.vocab_size == teacher.config.vocab_size assert student.config.hidden_size == teacher.config.hidden_size assert student.config.max_position_embeddings == teacher.config.max_position_embeddings if args.mlm: assert token_probs.size(0) == stu_architecture_config.vocab_size - ## DISTILLER ## + # DISTILLER # torch.cuda.empty_cache() distiller = Distiller( params=args, dataset=train_lm_seq_dataset, token_probs=token_probs, student=student, teacher=teacher diff --git a/examples/mm-imdb/run_mmimdb.py b/examples/mm-imdb/run_mmimdb.py index 24ad82190..abea83bff 100644 --- a/examples/mm-imdb/run_mmimdb.py +++ b/examples/mm-imdb/run_mmimdb.py @@ -344,7 +344,7 @@ def load_examples(args, tokenizer, evaluate=False): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--data_dir", default=None, @@ -374,7 +374,7 @@ def main(): help="The output directory where the model predictions and checkpoints will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) diff --git a/examples/run_bertology.py b/examples/run_bertology.py index 27709fa7e..c3fe4b471 100644 --- a/examples/run_bertology.py +++ b/examples/run_bertology.py @@ -242,7 +242,7 @@ def prune_heads(args, model, eval_dataloader, head_mask): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--data_dir", default=None, @@ -272,7 +272,7 @@ def main(): help="The output directory where the model predictions and checkpoints will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", diff --git a/examples/run_glue.py b/examples/run_glue.py index f98190059..fe5cc7e60 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -410,7 +410,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--data_dir", default=None, @@ -447,7 +447,7 @@ def main(): help="The output directory where the model predictions and checkpoints will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 
ae3d68dad..1fae12299 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -422,7 +422,7 @@ def evaluate(args, model, tokenizer, prefix=""): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)." ) @@ -434,7 +434,7 @@ def main(): help="The output directory where the model predictions and checkpoints will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--eval_data_file", default=None, diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index 82f5a7ee4..cb0ddb09a 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -385,7 +385,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--data_dir", default=None, @@ -422,7 +422,7 @@ def main(): help="The output directory where the model predictions and checkpoints will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) diff --git a/examples/run_ner.py b/examples/run_ner.py index 8d991555a..7120c3736 100644 --- a/examples/run_ner.py +++ b/examples/run_ner.py @@ -385,7 +385,7 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--data_dir", default=None, @@ -415,7 +415,7 @@ def main(): help="The output directory where the model predictions and checkpoints will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--labels", default="", diff --git a/examples/run_xnli.py b/examples/run_xnli.py index f772bb5cb..f550ca7c5 100644 --- a/examples/run_xnli.py +++ b/examples/run_xnli.py @@ -377,7 +377,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--data_dir", default=None, @@ -417,7 +417,7 @@ def main(): help="The output directory where the model predictions and checkpoints will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) diff --git a/templates/adding_a_new_example_script/run_xxx.py b/templates/adding_a_new_example_script/run_xxx.py index eec29b59b..aa5c5ae4c 100644 --- a/templates/adding_a_new_example_script/run_xxx.py +++ b/templates/adding_a_new_example_script/run_xxx.py @@ -401,7 +401,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--train_file", default=None, type=str, required=True, help="SQuAD json for training. 
E.g., train-v1.1.json" ) @@ -434,7 +434,7 @@ def main(): help="The output directory where the model checkpoints and predictions will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) diff --git a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py index 2e6c47347..06aa4bf37 100755 --- a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py +++ b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py @@ -43,7 +43,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) diff --git a/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py b/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py index bba3269a9..957379b5b 100644 --- a/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py @@ -43,7 +43,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pyt if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) diff --git a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py index 87608f482..50695dedb 100755 --- a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py @@ -43,7 +43,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) diff --git a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py index 3aa895725..4f5bb0aa6 100755 --- a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py @@ -51,7 +51,7 @@ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, p if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 
) diff --git a/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py index 25c2a0a00..d1d245dbe 100755 --- a/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py @@ -51,7 +51,7 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--openai_checkpoint_folder_path", default=None, diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py index ba1dec53b..649481430 100644 --- a/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -410,7 +410,7 @@ def convert_all_pt_checkpoints_to_tf( if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--tf_dump_path", default=None, type=str, required=True, help="Path to the output Tensorflow dump file." ) diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py index 5cf766b81..9044bc592 100644 --- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -94,7 +94,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ layer: BertLayer = model.roberta.encoder.layer[i] roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i] - ### self attention + # self attention self_attn: BertSelfAttention = layer.attention.self assert ( roberta_layer.self_attn.k_proj.weight.data.shape @@ -110,7 +110,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias - ### self-attention output + # self-attention output self_output: BertSelfOutput = layer.attention.output assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape self_output.dense.weight = roberta_layer.self_attn.out_proj.weight @@ -118,20 +118,20 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias - ### intermediate + # intermediate intermediate: BertIntermediate = layer.intermediate assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape intermediate.dense.weight = roberta_layer.fc1.weight intermediate.dense.bias = roberta_layer.fc1.bias - ### output + # output bert_output: BertOutput = layer.output assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape bert_output.dense.weight = roberta_layer.fc2.weight bert_output.dense.bias = roberta_layer.fc2.bias bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias - #### end of layer + # end of layer if classification_head: model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight @@ -170,7 +170,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ if __name__ == "__main__": parser = 
argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." ) diff --git a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py index 853c9b717..94ba61f6e 100755 --- a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py @@ -43,7 +43,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) diff --git a/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py index ef98b76ab..30768fa96 100755 --- a/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py @@ -70,7 +70,7 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." ) diff --git a/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py index 37e93b7a1..5c6522449 100755 --- a/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py @@ -82,7 +82,7 @@ def convert_xlnet_checkpoint_to_pytorch( if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py index aa732b31e..18a96a887 100644 --- a/transformers/modeling_distilbert.py +++ b/transformers/modeling_distilbert.py @@ -47,7 +47,7 @@ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { } -### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ### +# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # def gelu(x): return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0))) @@ -327,7 +327,7 @@ class Transformer(nn.Module): return outputs # last-layer hidden state, (all hidden states), (all attentions) -### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ### +# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class DistilBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py index 297d7edb1..8692e3eba 100644 --- a/transformers/modeling_tf_distilbert.py +++ b/transformers/modeling_tf_distilbert.py @@ -42,7 +42,7 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { } -### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ### +# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # def gelu(x): """ Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when initially created. 
@@ -463,7 +463,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) -### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ### +# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class TFDistilBertPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py index f05b8aa4a..92ff8bf21 100644 --- a/transformers/modeling_tf_pytorch_utils.py +++ b/transformers/modeling_tf_pytorch_utils.py @@ -67,7 +67,8 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="") ##################### -### PyTorch => TF 2.0 +# PyTorch => TF 2.0 # +##################### def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False): @@ -197,7 +198,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a ##################### -### TF 2.0 => PyTorch +# TF 2.0 => PyTorch # +##################### def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False): diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py index b5618df38..068e9ac12 100644 --- a/transformers/modeling_tf_transfo_xl.py +++ b/transformers/modeling_tf_transfo_xl.py @@ -79,23 +79,23 @@ class TFPositionwiseFF(tf.keras.layers.Layer): def call(self, inp, training=False): if self.pre_lnorm: - ##### layer normalization + positionwise feed-forward + # layer normalization + positionwise feed-forward core_out = self.layer_norm(inp) core_out = self.layer_1(core_out) core_out = self.drop_1(core_out, training=training) core_out = self.layer_2(core_out) core_out = self.drop_2(core_out, training=training) - ##### residual connection + # residual connection output = core_out + inp else: - ##### positionwise feed-forward + # positionwise feed-forward core_out = self.layer_1(inp) core_out = self.drop_1(core_out, training=training) core_out = self.layer_2(core_out) core_out = self.drop_2(core_out, training=training) - ##### residual connection + layer normalization + # residual connection + layer normalization output = self.layer_norm(inp + core_out) return output @@ -206,7 +206,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head)) # qlen x n_head x d_head - #### compute attention score + # compute attention score rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head AC = tf.einsum("ibnd,jbnd->ijbn", rw_head_q, w_head_k) # qlen x klen x bsz x n_head @@ -218,7 +218,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): attn_score = AC + BD attn_score = attn_score * self.scale - #### compute attention probability + # compute attention probability if attn_mask is not None: attn_mask_t = attn_mask[:, :, None, None] attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t @@ -231,22 +231,22 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): if head_mask is not None: attn_prob = attn_prob * head_mask - #### compute attention vector + # compute attention vector attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, w_head_v) # [qlen x bsz x n_head x d_head] attn_vec_sizes = shape_list(attn_vec) attn_vec = tf.reshape(attn_vec, (attn_vec_sizes[0], attn_vec_sizes[1], 
self.n_head * self.d_head)) - ##### linear projection + # linear projection attn_out = self.o_net(attn_vec) attn_out = self.drop(attn_out, training=training) if self.pre_lnorm: - ##### residual connection + # residual connection outputs = [w + attn_out] else: - ##### residual connection + layer normalization + # residual connection + layer normalization outputs = [self.layer_norm(w + attn_out)] if self.output_attentions: diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py index 9e48856a6..4bc8df2da 100644 --- a/transformers/modeling_tf_xlnet.py +++ b/transformers/modeling_tf_xlnet.py @@ -190,7 +190,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): (h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems, target_mapping, head_mask) = inputs if g is not None: - ###### Two-stream attention with relative positional encoding. + # Two-stream attention with relative positional encoding. # content based attention score if mems is not None and len(shape_list(mems)) > 1: cat = tf.concat([mems, h], axis=0) @@ -206,7 +206,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): # position-based key head k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r) - ##### h-stream + # h-stream # content-stream query head q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q) @@ -221,7 +221,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): # post processing output_h = self.post_attention([h, attn_vec_h], training=training) - ##### g-stream + # g-stream # query-stream query head q_head_g = tf.einsum("ibh,hnd->ibnd", g, self.q) @@ -251,7 +251,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): attn_prob = attn_prob_h, attn_prob_g else: - ###### Multi-head attention with relative positional encoding + # Multi-head attention with relative positional encoding if mems is not None and len(shape_list(mems)) > 1: cat = tf.concat([mems, h], axis=0) else: @@ -552,7 +552,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): dtype_float = tf.bfloat16 if self.use_bfloat16 else tf.float32 - ##### Attention mask + # Attention mask # causal attention mask if self.attn_type == "uni": attn_mask = self.create_mask(qlen, mlen) @@ -597,7 +597,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): else: non_tgt_mask = None - ##### Word embeddings and prepare h & g hidden states + # Word embeddings and prepare h & g hidden states if inputs_embeds is not None: word_emb_k = inputs_embeds else: @@ -612,7 +612,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): else: output_g = None - ##### Segment embedding + # Segment embedding if token_type_ids is not None: # Convert `token_type_ids` to one-hot `seg_mat` mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32) @@ -624,7 +624,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): else: seg_mat = None - ##### Positional encoding + # Positional encoding pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz, dtype=dtype_float) pos_emb = self.dropout(pos_emb, training=training) diff --git a/transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py index a9040b53d..4ac524ee8 100644 --- a/transformers/modeling_transfo_xl.py +++ b/transformers/modeling_transfo_xl.py @@ -213,16 +213,16 @@ class PositionwiseFF(nn.Module): def forward(self, inp): if self.pre_lnorm: - ##### layer normalization + positionwise feed-forward + # layer normalization + positionwise feed-forward core_out = self.CoreNet(self.layer_norm(inp)) - ##### residual connection + # residual connection output = core_out + inp else: - ##### positionwise feed-forward + # 
positionwise feed-forward core_out = self.CoreNet(inp) - ##### residual connection + layer normalization + # residual connection + layer normalization output = self.layer_norm(inp + core_out) return output @@ -316,7 +316,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head - #### compute attention score + # compute attention score rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head AC = torch.einsum("ibnd,jbnd->ijbn", (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head @@ -328,7 +328,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): attn_score = AC + BD attn_score.mul_(self.scale) - #### compute attention probability + # compute attention probability if attn_mask is not None and torch.sum(attn_mask).item(): attn_mask = attn_mask == 1 # Switch to bool if attn_mask.dim() == 2: @@ -352,21 +352,21 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): if head_mask is not None: attn_prob = attn_prob * head_mask - #### compute attention vector + # compute attention vector attn_vec = torch.einsum("ijbn,jbnd->ibnd", (attn_prob, w_head_v)) # [qlen x bsz x n_head x d_head] attn_vec = attn_vec.contiguous().view(attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) - ##### linear projection + # linear projection attn_out = self.o_net(attn_vec) attn_out = self.drop(attn_out) if self.pre_lnorm: - ##### residual connection + # residual connection outputs = [w + attn_out] else: - ##### residual connection + layer normalization + # residual connection + layer normalization outputs = [self.layer_norm(w + attn_out)] if self.output_attentions: diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py index d749f1d12..f87e09a3b 100644 --- a/transformers/modeling_xlnet.py +++ b/transformers/modeling_xlnet.py @@ -330,7 +330,7 @@ class XLNetRelativeAttention(nn.Module): def forward(self, h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None): if g is not None: - ###### Two-stream attention with relative positional encoding. + # Two-stream attention with relative positional encoding. 
# content based attention score if mems is not None and mems.dim() > 1: cat = torch.cat([mems, h], dim=0) @@ -346,7 +346,7 @@ class XLNetRelativeAttention(nn.Module): # position-based key head k_head_r = torch.einsum("ibh,hnd->ibnd", r, self.r) - ##### h-stream + # h-stream # content-stream query head q_head_h = torch.einsum("ibh,hnd->ibnd", h, self.q) @@ -361,7 +361,7 @@ class XLNetRelativeAttention(nn.Module): # post processing output_h = self.post_attention(h, attn_vec_h) - ##### g-stream + # g-stream # query-stream query head q_head_g = torch.einsum("ibh,hnd->ibnd", g, self.q) @@ -391,7 +391,7 @@ class XLNetRelativeAttention(nn.Module): attn_prob = attn_prob_h, attn_prob_g else: - ###### Multi-head attention with relative positional encoding + # Multi-head attention with relative positional encoding if mems is not None and mems.dim() > 1: cat = torch.cat([mems, h], dim=0) else: @@ -804,7 +804,7 @@ class XLNetModel(XLNetPreTrainedModel): dtype_float = next(self.parameters()).dtype device = next(self.parameters()).device - ##### Attention mask + # Attention mask # causal attention mask if self.attn_type == "uni": attn_mask = self.create_mask(qlen, mlen) @@ -849,7 +849,7 @@ class XLNetModel(XLNetPreTrainedModel): else: non_tgt_mask = None - ##### Word embeddings and prepare h & g hidden states + # Word embeddings and prepare h & g hidden states if inputs_embeds is not None: word_emb_k = inputs_embeds else: @@ -864,7 +864,7 @@ class XLNetModel(XLNetPreTrainedModel): else: output_g = None - ##### Segment embedding + # Segment embedding if token_type_ids is not None: # Convert `token_type_ids` to one-hot `seg_mat` if mlen > 0: @@ -879,7 +879,7 @@ class XLNetModel(XLNetPreTrainedModel): else: seg_mat = None - ##### Positional encoding + # Positional encoding pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz) pos_emb = self.dropout(pos_emb) diff --git a/transformers/optimization_tf.py b/transformers/optimization_tf.py index 18c261f6c..83eff902f 100644 --- a/transformers/optimization_tf.py +++ b/transformers/optimization_tf.py @@ -178,7 +178,7 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): return True -## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py +# Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py class GradientAccumulator(object): """Distribution strategies-aware gradient accumulation utility.""" -- GitLab From 654e051e2a602b822e6804a6199bb7d0e841dfdc Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 21:31:37 +0100 Subject: [PATCH 24/32] Ignore F401 flake8 warning (x326 / 594). --- transformers/__init__.py | 4 ++++ transformers/data/__init__.py | 4 ++++ transformers/data/processors/__init__.py | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/transformers/__init__.py b/transformers/__init__.py index f5f961a9e..84a308d1c 100755 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -1,3 +1,7 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + __version__ = "2.3.0" # Work around to update TensorFlow's absl.logging threshold which alters the diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py index c0a3cbf4c..8d5f6b85b 100644 --- a/transformers/data/__init__.py +++ b/transformers/data/__init__.py @@ -1,3 +1,7 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' 
imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + from .metrics import is_sklearn_available from .processors import ( DataProcessor, diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py index dee7f438a..4cb37faf2 100644 --- a/transformers/data/processors/__init__.py +++ b/transformers/data/processors/__init__.py @@ -1,3 +1,7 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor -- GitLab From 80327a13ea6181f564d937268b3da0bdab787b43 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 18:35:41 +0100 Subject: [PATCH 25/32] Fix F401 flake8 warning (x152 / 268). This change is mostly autogenerated with: $ python -m autoflake --in-place --recursive examples templates transformers utils hubconf.py setup.py I made minor changes in the generated diff. --- examples/contrib/run_camembert.py | 4 ---- examples/run_bertology.py | 2 +- examples/summarization/configuration_bertabs.py | 2 -- templates/adding_a_new_model/configuration_xxx.py | 3 --- templates/adding_a_new_model/modeling_tf_xxx.py | 6 ------ templates/adding_a_new_model/modeling_xxx.py | 5 ----- templates/adding_a_new_model/tests/modeling_tf_xxx_test.py | 1 - transformers/configuration_bert.py | 3 --- transformers/configuration_ctrl.py | 3 --- transformers/configuration_distilbert.py | 3 --- transformers/configuration_gpt2.py | 3 --- transformers/configuration_openai.py | 3 --- transformers/configuration_t5.py | 3 --- transformers/configuration_transfo_xl.py | 3 --- transformers/configuration_xlm.py | 3 --- transformers/configuration_xlnet.py | 3 --- transformers/data/metrics/__init__.py | 2 -- transformers/data/processors/squad.py | 1 - transformers/modeling_ctrl.py | 6 ------ transformers/modeling_distilbert.py | 3 --- transformers/modeling_encoder_decoder.py | 1 - transformers/modeling_gpt2.py | 4 ---- transformers/modeling_openai.py | 2 -- transformers/modeling_t5.py | 3 --- transformers/modeling_tf_bert.py | 4 ---- transformers/modeling_tf_ctrl.py | 3 --- transformers/modeling_tf_distilbert.py | 4 ---- transformers/modeling_tf_gpt2.py | 6 ------ transformers/modeling_tf_openai.py | 6 ------ transformers/modeling_tf_transfo_xl.py | 6 ------ transformers/modeling_tf_transfo_xl_utilities.py | 1 - transformers/modeling_tf_xlm.py | 1 - transformers/modeling_tf_xlnet.py | 4 ---- transformers/modeling_transfo_xl.py | 6 ------ transformers/modeling_transfo_xl_utilities.py | 1 - transformers/modeling_utils.py | 3 --- transformers/modeling_xlnet.py | 3 --- transformers/tests/configuration_common_test.py | 1 - transformers/tests/modeling_auto_test.py | 1 - transformers/tests/modeling_common_test.py | 3 --- transformers/tests/modeling_ctrl_test.py | 1 - transformers/tests/modeling_tf_albert_test.py | 1 - transformers/tests/modeling_tf_auto_test.py | 1 - transformers/tests/modeling_tf_bert_test.py | 1 - transformers/tests/modeling_tf_common_test.py | 7 ------- transformers/tests/modeling_tf_ctrl_test.py | 1 - transformers/tests/modeling_tf_gpt2_test.py | 1 - 
transformers/tests/modeling_tf_openai_gpt_test.py | 1 - transformers/tests/modeling_tf_t5_test.py | 1 - transformers/tests/modeling_tf_xlnet_test.py | 2 -- transformers/tests/modeling_xlnet_test.py | 2 -- transformers/tests/tokenization_auto_test.py | 1 - transformers/tests/tokenization_bert_japanese_test.py | 1 - transformers/tests/tokenization_distilbert_test.py | 2 -- transformers/tokenization_bert_japanese.py | 1 - transformers/tokenization_distilbert.py | 3 --- transformers/tokenization_roberta.py | 4 ---- utils/download_glue_data.py | 2 -- 58 files changed, 1 insertion(+), 157 deletions(-) diff --git a/examples/contrib/run_camembert.py b/examples/contrib/run_camembert.py index 791a02fed..3da66d419 100644 --- a/examples/contrib/run_camembert.py +++ b/examples/contrib/run_camembert.py @@ -1,7 +1,3 @@ -import tarfile -import urllib.request -from pathlib import Path - import torch from transformers.modeling_camembert import CamembertForMaskedLM diff --git a/examples/run_bertology.py b/examples/run_bertology.py index c3fe4b471..184e6a2e3 100644 --- a/examples/run_bertology.py +++ b/examples/run_bertology.py @@ -22,7 +22,7 @@ import argparse import logging import os -from datetime import datetime, timedelta +from datetime import datetime import numpy as np import torch diff --git a/examples/summarization/configuration_bertabs.py b/examples/summarization/configuration_bertabs.py index b862d58d2..530fb6107 100644 --- a/examples/summarization/configuration_bertabs.py +++ b/examples/summarization/configuration_bertabs.py @@ -14,9 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ BertAbs configuration """ -import json import logging -import sys from transformers import PretrainedConfig diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py index 9670b4f8c..226f8a1d8 100644 --- a/templates/adding_a_new_model/configuration_xxx.py +++ b/templates/adding_a_new_model/configuration_xxx.py @@ -16,10 +16,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging -import sys -from io import open import six diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py index edebd8ab0..c352625a8 100644 --- a/templates/adding_a_new_model/modeling_tf_xxx.py +++ b/templates/adding_a_new_model/modeling_tf_xxx.py @@ -21,14 +21,8 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import copy import itertools -import json import logging -import math -import os -import sys -from io import open import numpy as np import tensorflow as tf diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py index c4bcc55fd..abf58844d 100644 --- a/templates/adding_a_new_model/modeling_xxx.py +++ b/templates/adding_a_new_model/modeling_xxx.py @@ -20,14 +20,9 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import copy import itertools -import json import logging -import math import os -import sys -from io import open import torch from torch import nn diff --git a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py index b427df639..126a9eb05 100644 --- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py +++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py @@ -14,7 +14,6 @@ # limitations under the 
License. from __future__ import absolute_import, division, print_function -import sys import unittest from transformers import XxxConfig, is_tf_available diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py index 21ab7e47c..2ad168b5b 100644 --- a/transformers/configuration_bert.py +++ b/transformers/configuration_bert.py @@ -17,10 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging -import sys -from io import open from .configuration_utils import PretrainedConfig diff --git a/transformers/configuration_ctrl.py b/transformers/configuration_ctrl.py index 9b9a99960..001991df7 100644 --- a/transformers/configuration_ctrl.py +++ b/transformers/configuration_ctrl.py @@ -16,10 +16,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging -import sys -from io import open from .configuration_utils import PretrainedConfig diff --git a/transformers/configuration_distilbert.py b/transformers/configuration_distilbert.py index 8aae69ad0..2f6ec6eda 100644 --- a/transformers/configuration_distilbert.py +++ b/transformers/configuration_distilbert.py @@ -15,10 +15,7 @@ """ DistilBERT model configuration """ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging -import sys -from io import open from .configuration_utils import PretrainedConfig diff --git a/transformers/configuration_gpt2.py b/transformers/configuration_gpt2.py index 1bccdf9c4..e14923216 100644 --- a/transformers/configuration_gpt2.py +++ b/transformers/configuration_gpt2.py @@ -17,10 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging -import sys -from io import open from .configuration_utils import PretrainedConfig diff --git a/transformers/configuration_openai.py b/transformers/configuration_openai.py index 81b2c82c6..cc70a210f 100644 --- a/transformers/configuration_openai.py +++ b/transformers/configuration_openai.py @@ -17,10 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging -import sys -from io import open from .configuration_utils import PretrainedConfig diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py index 686e1af4d..e0f1dedd7 100644 --- a/transformers/configuration_t5.py +++ b/transformers/configuration_t5.py @@ -16,10 +16,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging -import sys -from io import open import six diff --git a/transformers/configuration_transfo_xl.py b/transformers/configuration_transfo_xl.py index 49e6ce303..38028cfbb 100644 --- a/transformers/configuration_transfo_xl.py +++ b/transformers/configuration_transfo_xl.py @@ -17,10 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging -import sys -from io import open from .configuration_utils import PretrainedConfig diff --git a/transformers/configuration_xlm.py b/transformers/configuration_xlm.py index cadf350c9..a9b4cc955 100644 --- a/transformers/configuration_xlm.py +++ b/transformers/configuration_xlm.py @@ -15,10 +15,7 @@ """ XLM configuration """ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging -import sys -from io import open from .configuration_utils import PretrainedConfig diff --git 
a/transformers/configuration_xlnet.py b/transformers/configuration_xlnet.py index 5af883e8e..1404dfeae 100644 --- a/transformers/configuration_xlnet.py +++ b/transformers/configuration_xlnet.py @@ -16,10 +16,7 @@ """ XLNet configuration """ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging -import sys -from io import open from .configuration_utils import PretrainedConfig diff --git a/transformers/data/metrics/__init__.py b/transformers/data/metrics/__init__.py index 5264d501a..4d8d55a1c 100644 --- a/transformers/data/metrics/__init__.py +++ b/transformers/data/metrics/__init__.py @@ -14,9 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import csv import logging -import sys logger = logging.getLogger(__name__) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index c6581374c..9c1c3edfe 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -1,4 +1,3 @@ -import collections import json import logging import os diff --git a/transformers/modeling_ctrl.py b/transformers/modeling_ctrl.py index 193a3d7d1..ed7a5bcc0 100644 --- a/transformers/modeling_ctrl.py +++ b/transformers/modeling_ctrl.py @@ -17,13 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import collections -import json import logging -import math -import os -import sys -from io import open import numpy as np import torch diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py index 18a96a887..943a75923 100644 --- a/transformers/modeling_distilbert.py +++ b/transformers/modeling_distilbert.py @@ -20,11 +20,8 @@ from __future__ import absolute_import, division, print_function, unicode_litera import copy import itertools -import json import logging import math -import sys -from io import open import numpy as np import torch diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py index dfdcc418d..65bbb4bbd 100644 --- a/transformers/modeling_encoder_decoder.py +++ b/transformers/modeling_encoder_decoder.py @@ -18,7 +18,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging import os -import warnings import torch from torch import nn diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py index ad086c451..67fc2c9d9 100644 --- a/transformers/modeling_gpt2.py +++ b/transformers/modeling_gpt2.py @@ -17,13 +17,9 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import collections -import json import logging import math import os -import sys -from io import open import torch import torch.nn as nn diff --git a/transformers/modeling_openai.py b/transformers/modeling_openai.py index 66487755c..a3b9c74e1 100644 --- a/transformers/modeling_openai.py +++ b/transformers/modeling_openai.py @@ -17,12 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import collections import json import logging import math import os -import sys from io import open import torch diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index 199ec7422..2061f6ce7 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -18,12 +18,9 @@ from __future__ import absolute_import, division, print_function, unicode_litera import copy import itertools -import json import logging import 
math import os -import sys -from io import open import torch import torch.nn.functional as F diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py index f67901618..c95455696 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -17,12 +17,8 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging -import math -import os import sys -from io import open import numpy as np import tensorflow as tf diff --git a/transformers/modeling_tf_ctrl.py b/transformers/modeling_tf_ctrl.py index 6f878c549..2f791b69f 100644 --- a/transformers/modeling_tf_ctrl.py +++ b/transformers/modeling_tf_ctrl.py @@ -18,9 +18,6 @@ from __future__ import absolute_import, division, print_function, unicode_literals import logging -import os -import sys -from io import open import numpy as np import tensorflow as tf diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py index 8692e3eba..b5fa934a6 100644 --- a/transformers/modeling_tf_distilbert.py +++ b/transformers/modeling_tf_distilbert.py @@ -16,13 +16,9 @@ """ from __future__ import absolute_import, division, print_function, unicode_literals -import copy import itertools -import json import logging import math -import sys -from io import open import numpy as np import tensorflow as tf diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py index 85f9773e0..47870cfa7 100644 --- a/transformers/modeling_tf_gpt2.py +++ b/transformers/modeling_tf_gpt2.py @@ -17,13 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import collections -import json import logging -import math -import os -import sys -from io import open import numpy as np import tensorflow as tf diff --git a/transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py index f7f98ecfd..f6430eecc 100644 --- a/transformers/modeling_tf_openai.py +++ b/transformers/modeling_tf_openai.py @@ -17,13 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import collections -import json import logging -import math -import os -import sys -from io import open import numpy as np import tensorflow as tf diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py index 068e9ac12..a1f972caf 100644 --- a/transformers/modeling_tf_transfo_xl.py +++ b/transformers/modeling_tf_transfo_xl.py @@ -18,13 +18,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import collections -import json import logging -import math -import os -import sys -from io import open import numpy as np import tensorflow as tf diff --git a/transformers/modeling_tf_transfo_xl_utilities.py b/transformers/modeling_tf_transfo_xl_utilities.py index 33244eae8..dd312c4da 100644 --- a/transformers/modeling_tf_transfo_xl_utilities.py +++ b/transformers/modeling_tf_transfo_xl_utilities.py @@ -16,7 +16,6 @@ """ A TF 2.0 Adaptive Softmax for Transformer XL model. 
""" -from collections import defaultdict import numpy as np import tensorflow as tf diff --git a/transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py index a29a0b7fe..9dcbcd7f6 100644 --- a/transformers/modeling_tf_xlm.py +++ b/transformers/modeling_tf_xlm.py @@ -19,7 +19,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera import itertools import logging import math -import os import numpy as np import tensorflow as tf diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py index 4bc8df2da..0fe898b16 100644 --- a/transformers/modeling_tf_xlnet.py +++ b/transformers/modeling_tf_xlnet.py @@ -17,12 +17,8 @@ """ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging -import math -import os import sys -from io import open import numpy as np import tensorflow as tf diff --git a/transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py index 4ac524ee8..e9bd1c489 100644 --- a/transformers/modeling_transfo_xl.py +++ b/transformers/modeling_transfo_xl.py @@ -20,13 +20,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import collections -import json import logging -import math -import os -import sys -from io import open import torch import torch.nn as nn diff --git a/transformers/modeling_transfo_xl_utilities.py b/transformers/modeling_transfo_xl_utilities.py index c41954164..1d600f74e 100644 --- a/transformers/modeling_transfo_xl_utilities.py +++ b/transformers/modeling_transfo_xl_utilities.py @@ -17,7 +17,6 @@ Directly adapted from https://github.com/kimiyoung/transformer-xl. """ -from collections import defaultdict import numpy as np import torch diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index d09a4880b..242419362 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -17,11 +17,8 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import copy -import json import logging import os -from io import open import six import torch diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py index f87e09a3b..e90114fc0 100644 --- a/transformers/modeling_xlnet.py +++ b/transformers/modeling_xlnet.py @@ -17,12 +17,9 @@ """ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging import math -import os import sys -from io import open import torch from torch import nn diff --git a/transformers/tests/configuration_common_test.py b/transformers/tests/configuration_common_test.py index 65a4a35ae..234301df2 100644 --- a/transformers/tests/configuration_common_test.py +++ b/transformers/tests/configuration_common_test.py @@ -16,7 +16,6 @@ from __future__ import absolute_import, division, print_function import json import os -import tempfile import unittest from .tokenization_tests_commons import TemporaryDirectory diff --git a/transformers/tests/modeling_auto_test.py b/transformers/tests/modeling_auto_test.py index b2bb54ffa..c2deedc2a 100644 --- a/transformers/tests/modeling_auto_test.py +++ b/transformers/tests/modeling_auto_test.py @@ -15,7 +15,6 @@ from __future__ import absolute_import, division, print_function import logging -import shutil import unittest from transformers import is_torch_available diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index 42de8c9ae..1b61c8031 100644 --- 
a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -47,7 +47,6 @@ if is_torch_available(): ) if sys.version_info[0] == 2: - import cPickle as pickle class TemporaryDirectory(object): """Context manager for tempfile.mkdtemp() so it's usable with "with" statement.""" @@ -61,8 +60,6 @@ if sys.version_info[0] == 2: else: - import pickle - TemporaryDirectory = tempfile.TemporaryDirectory unicode = str diff --git a/transformers/tests/modeling_ctrl_test.py b/transformers/tests/modeling_ctrl_test.py index cdcd69104..b6b52dd0b 100644 --- a/transformers/tests/modeling_ctrl_test.py +++ b/transformers/tests/modeling_ctrl_test.py @@ -13,7 +13,6 @@ # limitations under the License. from __future__ import absolute_import, division, print_function -import pdb import unittest from transformers import is_torch_available diff --git a/transformers/tests/modeling_tf_albert_test.py b/transformers/tests/modeling_tf_albert_test.py index 344e999a0..342ad6558 100644 --- a/transformers/tests/modeling_tf_albert_test.py +++ b/transformers/tests/modeling_tf_albert_test.py @@ -14,7 +14,6 @@ # limitations under the License. from __future__ import absolute_import, division, print_function -import sys import unittest from transformers import AlbertConfig, is_tf_available diff --git a/transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py index 54581505e..3530339b6 100644 --- a/transformers/tests/modeling_tf_auto_test.py +++ b/transformers/tests/modeling_tf_auto_test.py @@ -15,7 +15,6 @@ from __future__ import absolute_import, division, print_function import logging -import shutil import unittest from transformers import is_tf_available diff --git a/transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py index 735de447e..5e8c3259e 100644 --- a/transformers/tests/modeling_tf_bert_test.py +++ b/transformers/tests/modeling_tf_bert_test.py @@ -14,7 +14,6 @@ # limitations under the License. from __future__ import absolute_import, division, print_function -import sys import unittest from transformers import BertConfig, is_tf_available diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py index ac43e7e87..bd7e6636a 100644 --- a/transformers/tests/modeling_tf_common_test.py +++ b/transformers/tests/modeling_tf_common_test.py @@ -15,16 +15,12 @@ from __future__ import absolute_import, division, print_function import copy -import importlib -import json -import logging import os import random import shutil import sys import tempfile import unittest -import uuid from transformers import is_tf_available, is_torch_available @@ -39,7 +35,6 @@ if is_tf_available(): # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP if sys.version_info[0] == 2: - import cPickle as pickle class TemporaryDirectory(object): """Context manager for tempfile.mkdtemp() so it's usable with "with" statement.""" @@ -53,8 +48,6 @@ if sys.version_info[0] == 2: else: - import pickle - TemporaryDirectory = tempfile.TemporaryDirectory unicode = str diff --git a/transformers/tests/modeling_tf_ctrl_test.py b/transformers/tests/modeling_tf_ctrl_test.py index 895579eab..41a2da2a4 100644 --- a/transformers/tests/modeling_tf_ctrl_test.py +++ b/transformers/tests/modeling_tf_ctrl_test.py @@ -14,7 +14,6 @@ # limitations under the License. 
from __future__ import absolute_import, division, print_function -import sys import unittest from transformers import CTRLConfig, is_tf_available diff --git a/transformers/tests/modeling_tf_gpt2_test.py b/transformers/tests/modeling_tf_gpt2_test.py index 49bb10c43..e93399a27 100644 --- a/transformers/tests/modeling_tf_gpt2_test.py +++ b/transformers/tests/modeling_tf_gpt2_test.py @@ -14,7 +14,6 @@ # limitations under the License. from __future__ import absolute_import, division, print_function -import sys import unittest from transformers import GPT2Config, is_tf_available diff --git a/transformers/tests/modeling_tf_openai_gpt_test.py b/transformers/tests/modeling_tf_openai_gpt_test.py index 0198527f5..801cf23e1 100644 --- a/transformers/tests/modeling_tf_openai_gpt_test.py +++ b/transformers/tests/modeling_tf_openai_gpt_test.py @@ -14,7 +14,6 @@ # limitations under the License. from __future__ import absolute_import, division, print_function -import sys import unittest from transformers import OpenAIGPTConfig, is_tf_available diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py index 2108b9007..9042a763f 100644 --- a/transformers/tests/modeling_tf_t5_test.py +++ b/transformers/tests/modeling_tf_t5_test.py @@ -14,7 +14,6 @@ # limitations under the License. from __future__ import absolute_import, division, print_function -import sys import unittest from transformers import T5Config, is_tf_available diff --git a/transformers/tests/modeling_tf_xlnet_test.py b/transformers/tests/modeling_tf_xlnet_test.py index 67fc1a5ce..0e0c70ed5 100644 --- a/transformers/tests/modeling_tf_xlnet_test.py +++ b/transformers/tests/modeling_tf_xlnet_test.py @@ -14,8 +14,6 @@ # limitations under the License. from __future__ import absolute_import, division, print_function -import json -import os import random import unittest diff --git a/transformers/tests/modeling_xlnet_test.py b/transformers/tests/modeling_xlnet_test.py index a0d950472..decd7f0f4 100644 --- a/transformers/tests/modeling_xlnet_test.py +++ b/transformers/tests/modeling_xlnet_test.py @@ -14,8 +14,6 @@ # limitations under the License. from __future__ import absolute_import, division, print_function -import json -import os import random import unittest diff --git a/transformers/tests/tokenization_auto_test.py b/transformers/tests/tokenization_auto_test.py index 4ff2fa791..929f5f8a6 100644 --- a/transformers/tests/tokenization_auto_test.py +++ b/transformers/tests/tokenization_auto_test.py @@ -15,7 +15,6 @@ from __future__ import absolute_import, division, print_function import logging -import shutil import unittest from transformers import ( diff --git a/transformers/tests/tokenization_bert_japanese_test.py b/transformers/tests/tokenization_bert_japanese_test.py index 84119c081..526f823b7 100644 --- a/transformers/tests/tokenization_bert_japanese_test.py +++ b/transformers/tests/tokenization_bert_japanese_test.py @@ -15,7 +15,6 @@ from __future__ import absolute_import, division, print_function, unicode_literals import os -import unittest from io import open from transformers.tokenization_bert import WordpieceTokenizer diff --git a/transformers/tests/tokenization_distilbert_test.py b/transformers/tests/tokenization_distilbert_test.py index b7760e0eb..916db759c 100644 --- a/transformers/tests/tokenization_distilbert_test.py +++ b/transformers/tests/tokenization_distilbert_test.py @@ -14,9 +14,7 @@ # limitations under the License. 
from __future__ import absolute_import, division, print_function, unicode_literals -import os import unittest -from io import open from transformers.tokenization_distilbert import DistilBertTokenizer diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py index 80c499051..c17d2a3ee 100644 --- a/transformers/tokenization_bert_japanese.py +++ b/transformers/tokenization_bert_japanese.py @@ -20,7 +20,6 @@ import collections import logging import os import unicodedata -from io import open import six diff --git a/transformers/tokenization_distilbert.py b/transformers/tokenization_distilbert.py index bda5c6661..39dee1c3a 100644 --- a/transformers/tokenization_distilbert.py +++ b/transformers/tokenization_distilbert.py @@ -16,11 +16,8 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import collections import logging -import os import unicodedata -from io import open from .tokenization_bert import BertTokenizer diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py index 95472f5b3..9b34abfef 100644 --- a/transformers/tokenization_roberta.py +++ b/transformers/tokenization_roberta.py @@ -15,11 +15,7 @@ """Tokenization classes for RoBERTa.""" from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging -import os -import sys -from io import open import regex as re diff --git a/utils/download_glue_data.py b/utils/download_glue_data.py index af1562937..b46cbcd7b 100644 --- a/utils/download_glue_data.py +++ b/utils/download_glue_data.py @@ -20,9 +20,7 @@ rm MSRParaphraseCorpus.msi import argparse import os -import shutil import sys -import tempfile import urllib.request import zipfile -- GitLab From 783a61699962f4b058688db21d417e1932423417 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 21:54:07 +0100 Subject: [PATCH 26/32] Fix F401 flake8 warning (x88 / 116). This change is mostly autogenerated with: $ python -m autoflake --in-place --recursive --remove-all-unused-imports --ignore-init-module-imports examples templates transformers utils hubconf.py setup.py I made minor changes in the generated diff. 
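For context, flake8's F401 means a name is imported but never referenced. By default autoflake only removes unused imports of standard-library modules, so this pass adds --remove-all-unused-imports to also drop unused third-party and local imports, and --ignore-init-module-imports to leave the re-export-only __init__.py modules alone (those were handled with "# flake8: noqa" earlier in this series). As a rough illustration, on a hypothetical module such as:

    # Hypothetical example, not taken from this diff: flake8 reports
    # "F401 'json' imported but unused" and "F401 'sys' imported but unused",
    # and autoflake deletes exactly those two import lines.
    import json   # removed by autoflake
    import logging
    import sys    # removed by autoflake

    logger = logging.getLogger(__name__)

the cleanup leaves only the logging import behind.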
--- examples/distillation/distiller.py | 3 +-- examples/distillation/scripts/extract.py | 2 +- .../distillation/scripts/extract_distilbert.py | 2 +- examples/run_bertology.py | 3 +-- examples/run_squad.py | 2 +- examples/run_tf_ner.py | 1 - .../adding_a_new_model/configuration_xxx.py | 2 -- templates/adding_a_new_model/modeling_tf_xxx.py | 2 -- templates/adding_a_new_model/modeling_xxx.py | 3 +-- .../tests/modeling_tf_xxx_test.py | 1 - templates/adding_a_new_model/tokenization_xxx.py | 1 - transformers/commands/convert.py | 1 - transformers/configuration_t5.py | 2 -- .../convert_pytorch_checkpoint_to_tf2.py | 2 -- ...rta_original_pytorch_checkpoint_to_pytorch.py | 1 - transformers/data/metrics/squad_metrics.py | 4 +--- transformers/data/processors/squad.py | 4 ++-- transformers/hf_api.py | 1 - transformers/modeling_auto.py | 2 -- transformers/modeling_ctrl.py | 3 +-- transformers/modeling_distilbert.py | 1 - transformers/modeling_encoder_decoder.py | 1 - transformers/modeling_gpt2.py | 2 -- transformers/modeling_openai.py | 1 - transformers/modeling_t5.py | 2 +- transformers/modeling_tf_auto.py | 1 - transformers/modeling_tf_ctrl.py | 2 +- transformers/modeling_tf_distilbert.py | 1 - transformers/modeling_tf_pytorch_utils.py | 16 ++++++++-------- transformers/modeling_tf_roberta.py | 2 +- transformers/modeling_tf_transfo_xl.py | 3 +-- transformers/modeling_tf_transfo_xl_utilities.py | 1 - transformers/modeling_transfo_xl.py | 4 +--- transformers/modeling_transfo_xl_utilities.py | 1 - transformers/modeling_utils.py | 1 - transformers/pipelines.py | 1 - transformers/tests/modeling_auto_test.py | 3 --- transformers/tests/modeling_distilbert_test.py | 2 +- transformers/tests/modeling_t5_test.py | 4 ++-- transformers/tests/modeling_tf_albert_test.py | 1 - transformers/tests/modeling_tf_auto_test.py | 4 ---- transformers/tests/modeling_tf_common_test.py | 3 +-- transformers/tests/modeling_tf_ctrl_test.py | 1 - .../tests/modeling_tf_distilbert_test.py | 3 +-- transformers/tests/modeling_tf_t5_test.py | 3 +-- transformers/tests/tokenization_albert_test.py | 2 +- .../tests/tokenization_distilbert_test.py | 1 - .../tests/tokenization_transfo_xl_test.py | 1 - transformers/tokenization_bert_japanese.py | 1 - transformers/tokenization_distilbert.py | 1 - transformers/tokenization_roberta.py | 2 -- transformers/tokenization_xlm.py | 1 - 52 files changed, 30 insertions(+), 85 deletions(-) diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index 16d73ece3..c43f13de0 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -19,14 +19,13 @@ import math import os import time -import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from torch.optim import AdamW from torch.utils.data import BatchSampler, DataLoader, RandomSampler from torch.utils.data.distributed import DistributedSampler -from tqdm import tqdm, trange +from tqdm import tqdm import psutil from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups diff --git a/examples/distillation/scripts/extract.py b/examples/distillation/scripts/extract.py index f91b6d321..8d102c0cd 100644 --- a/examples/distillation/scripts/extract.py +++ b/examples/distillation/scripts/extract.py @@ -20,7 +20,7 @@ import argparse import torch -from transformers import BertForMaskedLM, GPT2LMHeadModel, RobertaForMaskedLM +from transformers import GPT2LMHeadModel, RobertaForMaskedLM if __name__ == "__main__": diff --git 
a/examples/distillation/scripts/extract_distilbert.py b/examples/distillation/scripts/extract_distilbert.py index db0dc3ed8..972418b56 100644 --- a/examples/distillation/scripts/extract_distilbert.py +++ b/examples/distillation/scripts/extract_distilbert.py @@ -20,7 +20,7 @@ import argparse import torch -from transformers import BertForMaskedLM, RobertaForMaskedLM +from transformers import BertForMaskedLM if __name__ == "__main__": diff --git a/examples/run_bertology.py b/examples/run_bertology.py index 184e6a2e3..6d69477fc 100644 --- a/examples/run_bertology.py +++ b/examples/run_bertology.py @@ -26,8 +26,7 @@ from datetime import datetime import numpy as np import torch -from torch.nn import CrossEntropyLoss, MSELoss -from torch.utils.data import DataLoader, SequentialSampler, Subset, TensorDataset +from torch.utils.data import DataLoader, SequentialSampler, Subset from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm diff --git a/examples/run_squad.py b/examples/run_squad.py index acf4d0eec..6495d2972 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -26,7 +26,7 @@ import timeit import numpy as np import torch -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange diff --git a/examples/run_tf_ner.py b/examples/run_tf_ner.py index 68c4b15a0..c360f9e77 100644 --- a/examples/run_tf_ner.py +++ b/examples/run_tf_ner.py @@ -1,5 +1,4 @@ # coding=utf-8 -import _pickle as pickle import collections import datetime import glob diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py index 226f8a1d8..f2feb7360 100644 --- a/templates/adding_a_new_model/configuration_xxx.py +++ b/templates/adding_a_new_model/configuration_xxx.py @@ -18,8 +18,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging -import six - from .configuration_utils import PretrainedConfig diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py index c352625a8..df64c1922 100644 --- a/templates/adding_a_new_model/modeling_tf_xxx.py +++ b/templates/adding_a_new_model/modeling_tf_xxx.py @@ -21,10 +21,8 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import itertools import logging -import numpy as np import tensorflow as tf from .configuration_xxx import XxxConfig diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py index abf58844d..6db97df1b 100644 --- a/templates/adding_a_new_model/modeling_xxx.py +++ b/templates/adding_a_new_model/modeling_xxx.py @@ -20,7 +20,6 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import itertools import logging import os @@ -30,7 +29,7 @@ from torch.nn import CrossEntropyLoss, MSELoss from .configuration_xxx import XxxConfig from .file_utils import add_start_docstrings -from .modeling_utils import PreTrainedModel, prune_linear_layer +from .modeling_utils import PreTrainedModel logger = logging.getLogger(__name__) diff --git a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py index 126a9eb05..844f538b7 100644 --- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py +++ 
b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py @@ -24,7 +24,6 @@ from .utils import CACHE_DIR, require_tf, slow if is_tf_available(): - import tensorflow as tf from transformers.modeling_tf_xxx import ( TFXxxModel, TFXxxForMaskedLM, diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py index 30e3ce567..690815b97 100644 --- a/templates/adding_a_new_model/tokenization_xxx.py +++ b/templates/adding_a_new_model/tokenization_xxx.py @@ -19,7 +19,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera import collections import logging import os -import unicodedata from io import open from .tokenization_utils import PreTrainedTokenizer diff --git a/transformers/commands/convert.py b/transformers/commands/convert.py index 29b6b1076..a858e13dd 100644 --- a/transformers/commands/convert.py +++ b/transformers/commands/convert.py @@ -1,7 +1,6 @@ from argparse import ArgumentParser, Namespace from logging import getLogger -from transformers import AutoModel, AutoTokenizer from transformers.commands import BaseTransformersCLICommand diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py index e0f1dedd7..9ba1ada6d 100644 --- a/transformers/configuration_t5.py +++ b/transformers/configuration_t5.py @@ -18,8 +18,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging -import six - from .configuration_utils import PretrainedConfig diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py index 649481430..54ffddbe9 100644 --- a/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -20,8 +20,6 @@ import argparse import logging import os -import tensorflow as tf - from transformers import ( ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py index 9044bc592..283370fc3 100644 --- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -20,7 +20,6 @@ import argparse import logging import pathlib -import numpy as np import torch from packaging import version diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index 0009a2e70..2b9778bcd 100644 --- a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -16,9 +16,7 @@ import re import string from io import open -from tqdm import tqdm - -from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize +from transformers.tokenization_bert import BasicTokenizer logger = logging.getLogger(__name__) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 9c1c3edfe..8df4547c5 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -8,8 +8,8 @@ import numpy as np from tqdm import tqdm from ...file_utils import is_tf_available, is_torch_available -from ...tokenization_bert import BasicTokenizer, whitespace_tokenize -from .utils import DataProcessor, InputExample, InputFeatures +from ...tokenization_bert import whitespace_tokenize +from .utils import DataProcessor if is_torch_available(): diff --git a/transformers/hf_api.py 
b/transformers/hf_api.py index 13469728d..9e287bd5b 100644 --- a/transformers/hf_api.py +++ b/transformers/hf_api.py @@ -21,7 +21,6 @@ from typing import List import requests import six -from requests.exceptions import HTTPError from tqdm import tqdm diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index 2df74130e..f5f8ed204 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -32,7 +32,6 @@ from .configuration_auto import ( XLMRobertaConfig, XLNetConfig, ) -from .file_utils import add_start_docstrings from .modeling_albert import ( ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForMaskedLM, @@ -76,7 +75,6 @@ from .modeling_roberta import ( ) from .modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5Model, T5WithLMHeadModel from .modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TransfoXLLMHeadModel, TransfoXLModel -from .modeling_utils import PreTrainedModel, SequenceSummary from .modeling_xlm import ( XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLMForQuestionAnswering, diff --git a/transformers/modeling_ctrl.py b/transformers/modeling_ctrl.py index ed7a5bcc0..9cd1ad731 100644 --- a/transformers/modeling_ctrl.py +++ b/transformers/modeling_ctrl.py @@ -23,11 +23,10 @@ import numpy as np import torch import torch.nn as nn from torch.nn import CrossEntropyLoss -from torch.nn.parameter import Parameter from .configuration_ctrl import CTRLConfig from .file_utils import add_start_docstrings -from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer +from .modeling_utils import Conv1D, PreTrainedModel logger = logging.getLogger(__name__) diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py index 943a75923..1c6cef1b3 100644 --- a/transformers/modeling_distilbert.py +++ b/transformers/modeling_distilbert.py @@ -19,7 +19,6 @@ from __future__ import absolute_import, division, print_function, unicode_literals import copy -import itertools import logging import math diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py index 65bbb4bbd..ec90dc7e4 100644 --- a/transformers/modeling_encoder_decoder.py +++ b/transformers/modeling_encoder_decoder.py @@ -21,7 +21,6 @@ import os import torch from torch import nn -from tqdm import trange from .modeling_auto import AutoModel, AutoModelWithLMHead diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py index 67fc2c9d9..15ae12c15 100644 --- a/transformers/modeling_gpt2.py +++ b/transformers/modeling_gpt2.py @@ -24,7 +24,6 @@ import os import torch import torch.nn as nn from torch.nn import CrossEntropyLoss -from torch.nn.parameter import Parameter from .configuration_gpt2 import GPT2Config from .file_utils import add_start_docstrings @@ -47,7 +46,6 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): """ try: import re - import numpy as np import tensorflow as tf except ImportError: logger.error( diff --git a/transformers/modeling_openai.py b/transformers/modeling_openai.py index a3b9c74e1..c3dec0109 100644 --- a/transformers/modeling_openai.py +++ b/transformers/modeling_openai.py @@ -26,7 +26,6 @@ from io import open import torch import torch.nn as nn from torch.nn import CrossEntropyLoss -from torch.nn.parameter import Parameter from .configuration_openai import OpenAIGPTConfig from .file_utils import add_start_docstrings diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index 2061f6ce7..5c2cd403f 100644 --- a/transformers/modeling_t5.py 
+++ b/transformers/modeling_t5.py @@ -25,7 +25,7 @@ import os import torch import torch.nn.functional as F from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import CrossEntropyLoss from .configuration_t5 import T5Config from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py index bd41893b6..e34f417a6 100644 --- a/transformers/modeling_tf_auto.py +++ b/transformers/modeling_tf_auto.py @@ -29,7 +29,6 @@ from .configuration_auto import ( XLMConfig, XLNetConfig, ) -from .file_utils import add_start_docstrings from .modeling_tf_albert import ( TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, TFAlbertForMaskedLM, diff --git a/transformers/modeling_tf_ctrl.py b/transformers/modeling_tf_ctrl.py index 2f791b69f..5b73fb193 100644 --- a/transformers/modeling_tf_ctrl.py +++ b/transformers/modeling_tf_ctrl.py @@ -24,7 +24,7 @@ import tensorflow as tf from .configuration_ctrl import CTRLConfig from .file_utils import add_start_docstrings -from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, get_initializer, shape_list +from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py index b5fa934a6..95a5ec036 100644 --- a/transformers/modeling_tf_distilbert.py +++ b/transformers/modeling_tf_distilbert.py @@ -16,7 +16,6 @@ """ from __future__ import absolute_import, division, print_function, unicode_literals -import itertools import logging import math diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py index 92ff8bf21..3882cd3c4 100644 --- a/transformers/modeling_tf_pytorch_utils.py +++ b/transformers/modeling_tf_pytorch_utils.py @@ -75,8 +75,8 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i """ Load pytorch checkpoints in a TF 2.0 model """ try: - import tensorflow as tf - import torch + import tensorflow as tf # noqa: F401 + import torch # noqa: F401 except ImportError as e: logger.error( "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " @@ -109,8 +109,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a """ Load pytorch state_dict in a TF 2.0 model. """ try: - import torch - import tensorflow as tf + import torch # noqa: F401 + import tensorflow as tf # noqa: F401 from tensorflow.python.keras import backend as K except ImportError as e: logger.error( @@ -208,8 +208,8 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs (see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357). """ try: - import tensorflow as tf - import torch + import tensorflow as tf # noqa: F401 + import torch # noqa: F401 except ImportError as e: logger.error( "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " @@ -250,8 +250,8 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F """ Load TF2.0 symbolic weights in a PyTorch model """ try: - import tensorflow as tf - import torch + import tensorflow as tf # noqa: F401 + import torch # noqa: F401 except ImportError as e: logger.error( "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. 
Please see " diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py index 7aca11f5b..9ad93c0b5 100644 --- a/transformers/modeling_tf_roberta.py +++ b/transformers/modeling_tf_roberta.py @@ -23,7 +23,7 @@ import tensorflow as tf from .configuration_roberta import RobertaConfig from .file_utils import add_start_docstrings -from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new +from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py index a1f972caf..1f3f7cd55 100644 --- a/transformers/modeling_tf_transfo_xl.py +++ b/transformers/modeling_tf_transfo_xl.py @@ -20,13 +20,12 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging -import numpy as np import tensorflow as tf from .configuration_transfo_xl import TransfoXLConfig from .file_utils import add_start_docstrings from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask -from .modeling_tf_utils import TFConv1D, TFPreTrainedModel, TFSequenceSummary, get_initializer, shape_list +from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list logger = logging.getLogger(__name__) diff --git a/transformers/modeling_tf_transfo_xl_utilities.py b/transformers/modeling_tf_transfo_xl_utilities.py index dd312c4da..cd32d8639 100644 --- a/transformers/modeling_tf_transfo_xl_utilities.py +++ b/transformers/modeling_tf_transfo_xl_utilities.py @@ -17,7 +17,6 @@ """ -import numpy as np import tensorflow as tf from .modeling_tf_utils import shape_list diff --git a/transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py index e9bd1c489..a6b71538e 100644 --- a/transformers/modeling_transfo_xl.py +++ b/transformers/modeling_transfo_xl.py @@ -25,13 +25,11 @@ import logging import torch import torch.nn as nn import torch.nn.functional as F -from torch.nn import CrossEntropyLoss -from torch.nn.parameter import Parameter from .configuration_transfo_xl import TransfoXLConfig from .file_utils import add_start_docstrings from .modeling_transfo_xl_utilities import LogUniformSampler, ProjectedAdaptiveLogSoftmax, sample_logits -from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer +from .modeling_utils import PreTrainedModel logger = logging.getLogger(__name__) diff --git a/transformers/modeling_transfo_xl_utilities.py b/transformers/modeling_transfo_xl_utilities.py index 1d600f74e..63900c7b8 100644 --- a/transformers/modeling_transfo_xl_utilities.py +++ b/transformers/modeling_transfo_xl_utilities.py @@ -18,7 +18,6 @@ """ -import numpy as np import torch import torch.nn as nn import torch.nn.functional as F diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 242419362..6fe6fd532 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -20,7 +20,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging import os -import six import torch from torch import nn from torch.nn import CrossEntropyLoss diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 14cb4ac84..1a18de0d9 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -22,7 +22,6 @@ import pickle import sys from abc import ABC, abstractmethod from contextlib import contextmanager -from itertools import groupby from os.path 
import abspath, exists from typing import Dict, List, Optional, Tuple, Union diff --git a/transformers/tests/modeling_auto_test.py b/transformers/tests/modeling_auto_test.py index c2deedc2a..a174dca86 100644 --- a/transformers/tests/modeling_auto_test.py +++ b/transformers/tests/modeling_auto_test.py @@ -37,9 +37,6 @@ if is_torch_available(): ) from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP - from .modeling_common_test import CommonTestCases, ids_tensor - from .configuration_common_test import ConfigTester - @require_torch class AutoModelTest(unittest.TestCase): diff --git a/transformers/tests/modeling_distilbert_test.py b/transformers/tests/modeling_distilbert_test.py index eee84af1c..1044f15ee 100644 --- a/transformers/tests/modeling_distilbert_test.py +++ b/transformers/tests/modeling_distilbert_test.py @@ -20,7 +20,7 @@ from transformers import is_torch_available from .configuration_common_test import ConfigTester from .modeling_common_test import CommonTestCases, ids_tensor -from .utils import CACHE_DIR, require_torch, slow, torch_device +from .utils import require_torch, torch_device if is_torch_available(): diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py index 2bf3bdae1..460037ea3 100644 --- a/transformers/tests/modeling_t5_test.py +++ b/transformers/tests/modeling_t5_test.py @@ -19,8 +19,8 @@ import unittest from transformers import is_torch_available from .configuration_common_test import ConfigTester -from .modeling_common_test import CommonTestCases, floats_tensor, ids_tensor -from .utils import CACHE_DIR, require_torch, slow, torch_device +from .modeling_common_test import CommonTestCases, ids_tensor +from .utils import CACHE_DIR, require_torch, slow if is_torch_available(): diff --git a/transformers/tests/modeling_tf_albert_test.py b/transformers/tests/modeling_tf_albert_test.py index 342ad6558..a34f4b381 100644 --- a/transformers/tests/modeling_tf_albert_test.py +++ b/transformers/tests/modeling_tf_albert_test.py @@ -24,7 +24,6 @@ from .utils import CACHE_DIR, require_tf, slow if is_tf_available(): - import tensorflow as tf from transformers.modeling_tf_albert import ( TFAlbertModel, TFAlbertForMaskedLM, diff --git a/transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py index 3530339b6..b06d52ed2 100644 --- a/transformers/tests/modeling_tf_auto_test.py +++ b/transformers/tests/modeling_tf_auto_test.py @@ -35,10 +35,6 @@ if is_tf_available(): TFAutoModelForQuestionAnswering, TFBertForQuestionAnswering, ) - from transformers.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP - - from .modeling_common_test import CommonTestCases, ids_tensor - from .configuration_common_test import ConfigTester @require_tf diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py index bd7e6636a..fb85181a7 100644 --- a/transformers/tests/modeling_tf_common_test.py +++ b/transformers/tests/modeling_tf_common_test.py @@ -24,13 +24,12 @@ import unittest from transformers import is_tf_available, is_torch_available -from .utils import require_tf, slow +from .utils import require_tf if is_tf_available(): import tensorflow as tf import numpy as np - from transformers import TFPreTrainedModel # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP diff --git a/transformers/tests/modeling_tf_ctrl_test.py b/transformers/tests/modeling_tf_ctrl_test.py index 41a2da2a4..dad072cd3 100644 --- 
a/transformers/tests/modeling_tf_ctrl_test.py +++ b/transformers/tests/modeling_tf_ctrl_test.py @@ -24,7 +24,6 @@ from .utils import CACHE_DIR, require_tf, slow if is_tf_available(): - import tensorflow as tf from transformers.modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP diff --git a/transformers/tests/modeling_tf_distilbert_test.py b/transformers/tests/modeling_tf_distilbert_test.py index ebb17e298..5b343c09a 100644 --- a/transformers/tests/modeling_tf_distilbert_test.py +++ b/transformers/tests/modeling_tf_distilbert_test.py @@ -20,11 +20,10 @@ from transformers import DistilBertConfig, is_tf_available from .configuration_common_test import ConfigTester from .modeling_tf_common_test import TFCommonTestCases, ids_tensor -from .utils import CACHE_DIR, require_tf, slow +from .utils import require_tf if is_tf_available(): - import tensorflow as tf from transformers.modeling_tf_distilbert import ( TFDistilBertModel, TFDistilBertForMaskedLM, diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py index 9042a763f..84919bf43 100644 --- a/transformers/tests/modeling_tf_t5_test.py +++ b/transformers/tests/modeling_tf_t5_test.py @@ -24,8 +24,7 @@ from .utils import CACHE_DIR, require_tf, slow if is_tf_available(): - import tensorflow as tf - from transformers.modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP + from transformers.modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel @require_tf diff --git a/transformers/tests/tokenization_albert_test.py b/transformers/tests/tokenization_albert_test.py index 867dd5591..88d18031f 100644 --- a/transformers/tests/tokenization_albert_test.py +++ b/transformers/tests/tokenization_albert_test.py @@ -17,7 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest -from transformers.tokenization_albert import SPIECE_UNDERLINE, AlbertTokenizer +from transformers.tokenization_albert import AlbertTokenizer from .tokenization_tests_commons import CommonTestCases diff --git a/transformers/tests/tokenization_distilbert_test.py b/transformers/tests/tokenization_distilbert_test.py index 916db759c..3417fc76d 100644 --- a/transformers/tests/tokenization_distilbert_test.py +++ b/transformers/tests/tokenization_distilbert_test.py @@ -19,7 +19,6 @@ import unittest from transformers.tokenization_distilbert import DistilBertTokenizer from .tokenization_bert_test import BertTokenizationTest -from .tokenization_tests_commons import CommonTestCases from .utils import slow diff --git a/transformers/tests/tokenization_transfo_xl_test.py b/transformers/tests/tokenization_transfo_xl_test.py index cb9d3d4de..6b1d1dfce 100644 --- a/transformers/tests/tokenization_transfo_xl_test.py +++ b/transformers/tests/tokenization_transfo_xl_test.py @@ -25,7 +25,6 @@ from .utils import require_torch if is_torch_available(): - import torch from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py index c17d2a3ee..439d652be 100644 --- a/transformers/tokenization_bert_japanese.py +++ b/transformers/tokenization_bert_japanese.py @@ -24,7 +24,6 @@ import unicodedata import six from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab -from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) diff --git 
a/transformers/tokenization_distilbert.py b/transformers/tokenization_distilbert.py index 39dee1c3a..0821c859d 100644 --- a/transformers/tokenization_distilbert.py +++ b/transformers/tokenization_distilbert.py @@ -17,7 +17,6 @@ from __future__ import absolute_import, division, print_function, unicode_literals import logging -import unicodedata from .tokenization_bert import BertTokenizer diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py index 9b34abfef..bc1695f23 100644 --- a/transformers/tokenization_roberta.py +++ b/transformers/tokenization_roberta.py @@ -17,8 +17,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging -import regex as re - from .tokenization_gpt2 import GPT2Tokenizer diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py index 9d315f880..062d2697a 100644 --- a/transformers/tokenization_xlm.py +++ b/transformers/tokenization_xlm.py @@ -25,7 +25,6 @@ from io import open import sacremoses as sm -from .tokenization_bert import BasicTokenizer from .tokenization_utils import PreTrainedTokenizer -- GitLab From 939148b050930897510c26f6d2833ef8e8029fa2 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 22:04:51 +0100 Subject: [PATCH 27/32] Fix F401 flake8 warning (x28). Do manually what autoflake couldn't manage. --- examples/run_bertology.py | 12 ------------ .../adding_a_new_model/tests/modeling_tf_xxx_test.py | 1 - .../adding_a_new_model/tests/modeling_xxx_test.py | 3 --- transformers/convert_pytorch_checkpoint_to_tf2.py | 1 - ...roberta_original_pytorch_checkpoint_to_pytorch.py | 9 +-------- transformers/modeling_auto.py | 2 -- transformers/modeling_tf_utils.py | 10 +--------- transformers/modeling_tf_xlm.py | 9 +-------- transformers/modeling_xlnet.py | 9 +-------- transformers/tests/modeling_common_test.py | 3 --- transformers/tests/modeling_tf_bert_test.py | 1 - 11 files changed, 4 insertions(+), 56 deletions(-) diff --git a/examples/run_bertology.py b/examples/run_bertology.py index 6d69477fc..acac56128 100644 --- a/examples/run_bertology.py +++ b/examples/run_bertology.py @@ -31,18 +31,6 @@ from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm from run_glue import ALL_MODELS, MODEL_CLASSES, load_and_cache_examples, set_seed -from transformers import ( - WEIGHTS_NAME, - BertConfig, - BertForSequenceClassification, - BertTokenizer, - XLMConfig, - XLMForSequenceClassification, - XLMTokenizer, - XLNetConfig, - XLNetForSequenceClassification, - XLNetTokenizer, -) from transformers import glue_compute_metrics as compute_metrics from transformers import glue_output_modes as output_modes from transformers import glue_processors as processors diff --git a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py index 844f538b7..cb0898488 100644 --- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py +++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py @@ -30,7 +30,6 @@ if is_tf_available(): TFXxxForSequenceClassification, TFXxxForTokenClassification, TFXxxForQuestionAnswering, - TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP, ) diff --git a/templates/adding_a_new_model/tests/modeling_xxx_test.py b/templates/adding_a_new_model/tests/modeling_xxx_test.py index 4191922eb..1c9baa44f 100644 --- a/templates/adding_a_new_model/tests/modeling_xxx_test.py +++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py @@ -28,12 +28,9 @@ if is_torch_available(): 
XxxConfig, XxxModel, XxxForMaskedLM, - XxxForNextSentencePrediction, - XxxForPreTraining, XxxForQuestionAnswering, XxxForSequenceClassification, XxxForTokenClassification, - XxxForMultipleChoice, ) from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py index 54ffddbe9..9eb8529fe 100644 --- a/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -47,7 +47,6 @@ from transformers import ( TFCTRLLMHeadModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, - TFDistilBertForSequenceClassification, TFGPT2LMHeadModel, TFOpenAIGPTLMHeadModel, TFRobertaForMaskedLM, diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py index 283370fc3..588205737 100644 --- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -28,20 +28,13 @@ from fairseq.models.roberta import RobertaModel as FairseqRobertaModel from fairseq.modules import TransformerSentenceEncoderLayer from transformers.modeling_bert import ( BertConfig, - BertEncoder, BertIntermediate, BertLayer, - BertModel, BertOutput, BertSelfAttention, BertSelfOutput, ) -from transformers.modeling_roberta import ( - RobertaEmbeddings, - RobertaForMaskedLM, - RobertaForSequenceClassification, - RobertaModel, -) +from transformers.modeling_roberta import RobertaForMaskedLM, RobertaForSequenceClassification if version.parse(fairseq.__version__) < version.parse("0.9.0"): diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index f5f8ed204..0bbefb4fd 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -50,7 +50,6 @@ from .modeling_bert import ( from .modeling_camembert import ( CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, CamembertForMaskedLM, - CamembertForMultipleChoice, CamembertForSequenceClassification, CamembertForTokenClassification, CamembertModel, @@ -85,7 +84,6 @@ from .modeling_xlm import ( from .modeling_xlm_roberta import ( XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, XLMRobertaForMaskedLM, - XLMRobertaForMultipleChoice, XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification, XLMRobertaModel, diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py index 69ceaaf5a..bd9df0091 100644 --- a/transformers/modeling_tf_utils.py +++ b/transformers/modeling_tf_utils.py @@ -25,15 +25,7 @@ import tensorflow as tf from tensorflow.python.keras.saving import hdf5_format from .configuration_utils import PretrainedConfig -from .file_utils import ( - DUMMY_INPUTS, - TF2_WEIGHTS_NAME, - TF_WEIGHTS_NAME, - WEIGHTS_NAME, - cached_path, - hf_bucket_url, - is_remote_url, -) +from .file_utils import DUMMY_INPUTS, TF2_WEIGHTS_NAME, WEIGHTS_NAME, cached_path, hf_bucket_url, is_remote_url from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model diff --git a/transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py index 9dcbcd7f6..8ca5c6993 100644 --- a/transformers/modeling_tf_xlm.py +++ b/transformers/modeling_tf_xlm.py @@ -25,14 +25,7 @@ import tensorflow as tf from .configuration_xlm import XLMConfig from .file_utils import add_start_docstrings -from .modeling_tf_utils import ( - DUMMY_INPUTS, - TFPreTrainedModel, - TFSequenceSummary, - TFSharedEmbeddings, - 
get_initializer, - shape_list, -) +from .modeling_tf_utils import TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, get_initializer, shape_list logger = logging.getLogger(__name__) diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py index e90114fc0..9682c5a23 100644 --- a/transformers/modeling_xlnet.py +++ b/transformers/modeling_xlnet.py @@ -28,14 +28,7 @@ from torch.nn import functional as F from .configuration_xlnet import XLNetConfig from .file_utils import add_start_docstrings -from .modeling_utils import ( - PoolerAnswerClass, - PoolerEndLogits, - PoolerStartLogits, - PreTrainedModel, - SequenceSummary, - prune_linear_layer, -) +from .modeling_utils import PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, PreTrainedModel, SequenceSummary logger = logging.getLogger(__name__) diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index 1b61c8031..591aa648c 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -41,9 +41,6 @@ if is_torch_available(): BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - GPT2LMHeadModel, - GPT2Config, - GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, ) if sys.version_info[0] == 2: diff --git a/transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py index 5e8c3259e..e07ef4f2b 100644 --- a/transformers/tests/modeling_tf_bert_test.py +++ b/transformers/tests/modeling_tf_bert_test.py @@ -34,7 +34,6 @@ if is_tf_available(): TFBertForMultipleChoice, TFBertForTokenClassification, TFBertForQuestionAnswering, - TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, ) -- GitLab From 7c6812645afa8ac8ff0c264a8d86f487aa4ed33b Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 22:13:14 +0100 Subject: [PATCH 28/32] Restore proper import for HTTPError. --- transformers/commands/user.py | 4 +++- transformers/tests/hf_api_test.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/transformers/commands/user.py b/transformers/commands/user.py index 65761ae98..c0c6a6473 100644 --- a/transformers/commands/user.py +++ b/transformers/commands/user.py @@ -3,8 +3,10 @@ from argparse import ArgumentParser from getpass import getpass from typing import List, Union +from requests.exceptions import HTTPError + from transformers.commands import BaseTransformersCLICommand -from transformers.hf_api import HfApi, HfFolder, HTTPError +from transformers.hf_api import HfApi, HfFolder class UserCommands(BaseTransformersCLICommand): diff --git a/transformers/tests/hf_api_test.py b/transformers/tests/hf_api_test.py index 0c86fab97..af72408d2 100644 --- a/transformers/tests/hf_api_test.py +++ b/transformers/tests/hf_api_test.py @@ -20,8 +20,9 @@ import unittest import requests import six +from requests.exceptions import HTTPError -from transformers.hf_api import HfApi, HfFolder, HTTPError, PresignedUrl, S3Obj +from transformers.hf_api import HfApi, HfFolder, PresignedUrl, S3Obj USER = "__DUMMY_TRANSFORMERS_USER__" -- GitLab From 577a03664df498b1f68f6ce4d30155a2eee13ed8 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 22:31:44 +0100 Subject: [PATCH 29/32] Enforce flake8 in CI. --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 89b78ae69..35de1281f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -96,9 +96,10 @@ jobs: - checkout - run: sudo pip install --editable . 
- run: sudo pip install torch tensorflow tensorboardX scikit-learn - - run: sudo pip install black git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort + - run: sudo pip install black git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort flake8 - run: black --check --line-length 119 examples templates transformers utils - run: isort --check-only --recursive examples templates transformers utils + - run: flake8 examples templates transformers utils check_repository_consistency: working_directory: ~/transformers docker: -- GitLab From c9270086eac5dd5b0ed9c0fc736f1b27071b11f4 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 22:32:10 +0100 Subject: [PATCH 30/32] Disable flake8 F841 in CI to get a passing run. I'll fix it later. --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 1b24e6d1e..1af9e2c3f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -9,5 +9,5 @@ multi_line_output = 3 use_parentheses = True [flake8] -ignore = E203, E501, W503 +ignore = E203, E501, F841, W503 max-line-length = 119 -- GitLab From 2a34d5b71baf236bda52f2885a10d1ac49d3f9b8 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sun, 22 Dec 2019 11:07:31 +0100 Subject: [PATCH 31/32] Stabilize import order for packaging. I don't want to consider it a dependency of transformers, but it's usually there in local development and usually not there in CI. --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 1af9e2c3f..b3f95d39d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,6 +3,7 @@ ensure_newline_before_comments = True force_grid_wrap = 0 include_trailing_comma = True known_first_party = transformers +known_third_party = packaging line_length = 119 lines_after_imports = 2 multi_line_output = 3 -- GitLab From c11b3e2926740d134dd83ce96b6ec64d6f29d49b Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sun, 22 Dec 2019 11:17:48 +0100 Subject: [PATCH 32/32] Sort imports for optional third-party libraries. These libraries aren't always installed in the virtual environment where isort is running. Declaring them properly avoids mixing these third-party imports with local imports. --- .circleci/config.yml | 2 +- examples/distillation/distiller.py | 2 +- examples/distillation/utils.py | 3 +-- examples/mm-imdb/utils_mmimdb.py | 3 +-- examples/pplm/run_pplm_discrim_train.py | 6 +++--- examples/run_ner.py | 2 +- examples/run_tf_glue.py | 2 +- examples/run_tf_ner.py | 2 +- setup.cfg | 16 +++++++++++++++- ...rta_original_pytorch_checkpoint_to_pytorch.py | 6 +++--- 10 files changed, 28 insertions(+), 16 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 35de1281f..e46529556 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -95,7 +95,7 @@ jobs: steps: - checkout - run: sudo pip install --editable . 
- - run: sudo pip install torch tensorflow tensorboardX scikit-learn + - run: sudo pip install torch tensorflow - run: sudo pip install black git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort flake8 - run: black --check --line-length 119 examples templates transformers utils - run: isort --check-only --recursive examples templates transformers utils diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index c43f13de0..53669623b 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -19,6 +19,7 @@ import math import os import time +import psutil import torch import torch.nn as nn import torch.nn.functional as F @@ -27,7 +28,6 @@ from torch.utils.data import BatchSampler, DataLoader, RandomSampler from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm -import psutil from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups from lm_seqs_dataset import LmSeqsDataset from transformers import get_linear_schedule_with_warmup diff --git a/examples/distillation/utils.py b/examples/distillation/utils.py index b081f239c..211e7c61d 100644 --- a/examples/distillation/utils.py +++ b/examples/distillation/utils.py @@ -20,11 +20,10 @@ import logging import os import socket +import git import numpy as np import torch -import git - logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s", diff --git a/examples/mm-imdb/utils_mmimdb.py b/examples/mm-imdb/utils_mmimdb.py index 7a52a99b1..aa0460639 100644 --- a/examples/mm-imdb/utils_mmimdb.py +++ b/examples/mm-imdb/utils_mmimdb.py @@ -20,11 +20,10 @@ from collections import Counter import torch import torch.nn as nn -from torch.utils.data import Dataset - import torchvision import torchvision.transforms as transforms from PIL import Image +from torch.utils.data import Dataset POOLING_BREAKDOWN = {1: (1, 1), 2: (2, 1), 3: (3, 1), 4: (2, 2), 5: (5, 1), 6: (3, 2), 7: (7, 1), 8: (4, 2), 9: (3, 3)} diff --git a/examples/pplm/run_pplm_discrim_train.py b/examples/pplm/run_pplm_discrim_train.py index 1c21c56c8..44f6b726d 100644 --- a/examples/pplm/run_pplm_discrim_train.py +++ b/examples/pplm/run_pplm_discrim_train.py @@ -26,12 +26,12 @@ import torch import torch.nn.functional as F import torch.optim as optim import torch.utils.data as data -from tqdm import tqdm, trange - from nltk.tokenize.treebank import TreebankWordDetokenizer -from pplm_classification_head import ClassificationHead from torchtext import data as torchtext_data from torchtext import datasets +from tqdm import tqdm, trange + +from pplm_classification_head import ClassificationHead from transformers import GPT2LMHeadModel, GPT2Tokenizer diff --git a/examples/run_ner.py b/examples/run_ner.py index 7120c3736..34ba2663b 100644 --- a/examples/run_ner.py +++ b/examples/run_ner.py @@ -25,13 +25,13 @@ import random import numpy as np import torch +from seqeval.metrics import f1_score, precision_score, recall_score from tensorboardX import SummaryWriter from torch.nn import CrossEntropyLoss from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange -from seqeval.metrics import f1_score, precision_score, recall_score from transformers import ( WEIGHTS_NAME, AdamW, diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py index 511a98e94..8398ccb4c 100644 --- a/examples/run_tf_glue.py 
+++ b/examples/run_tf_glue.py @@ -1,8 +1,8 @@ import os import tensorflow as tf - import tensorflow_datasets + from transformers import ( BertConfig, BertForSequenceClassification, diff --git a/examples/run_tf_ner.py b/examples/run_tf_ner.py index c360f9e77..6aa0f4bc3 100644 --- a/examples/run_tf_ner.py +++ b/examples/run_tf_ner.py @@ -9,9 +9,9 @@ import re import numpy as np import tensorflow as tf from absl import app, flags, logging - from fastprogress import master_bar, progress_bar from seqeval import metrics + from transformers import ( TF2_WEIGHTS_NAME, BertConfig, diff --git a/setup.cfg b/setup.cfg index b3f95d39d..f59ce55df 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,21 @@ ensure_newline_before_comments = True force_grid_wrap = 0 include_trailing_comma = True known_first_party = transformers -known_third_party = packaging +known_third_party = + fairseq + fastprogress + git + nltk + packaging + PIL + psutil + seqeval + sklearn + tensorboardX + tensorflow_datasets + torchtext + torchvision + line_length = 119 lines_after_imports = 2 multi_line_output = 3 diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py index 588205737..7e86f3a93 100644 --- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -20,12 +20,12 @@ import argparse import logging import pathlib -import torch -from packaging import version - import fairseq +import torch from fairseq.models.roberta import RobertaModel as FairseqRobertaModel from fairseq.modules import TransformerSentenceEncoderLayer +from packaging import version + from transformers.modeling_bert import ( BertConfig, BertIntermediate, -- GitLab
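A short note on what the known_third_party list in the last patch buys us: isort decides which section an import belongs to partly by looking the package up in the environment it runs in, so an optional dependency such as seqeval that is absent from the CI virtualenv would otherwise drift out of the third-party block and mix with the local imports, exactly as the commit message describes. The sketch below (illustrative only, not a file from the repository; running it assumes numpy and seqeval are installed) shows the layout isort should now keep stable:

    # Expected grouping under the setup.cfg above: standard library, then third-party,
    # then first-party, with two blank lines after the imports (lines_after_imports = 2).
    import os

    import numpy as np  # third-party, detected from the installed environment
    from seqeval.metrics import f1_score  # third-party only because setup.cfg declares it

    from transformers import BertConfig  # first-party (known_first_party = transformers)


    print(os.name, np.zeros(1), f1_score, BertConfig)  # use the names so this sketch does not itself trip F401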