Merge pull request #2255 from aaugustin/implement-best-practices

Implement some Python best practices

Merge pull request #2255 from aaugustin/implement-best-practices
Implement some Python best practices
54abc67a · Thomas Wolf · GitHub · 645713e2 · c11b3e29 · 54abc67a
Unverified Commit 54abc67a authored Dec 22, 2019 by Thomas Wolf Committed by GitHub Dec 22, 2019
20 changed files
--- a/examples/utils_ner.py
+++ b/examples/utils_ner.py
@@ -21,6 +21,7 @@ import logging
 import os
 from io import open
 logger = logging.getLogger(__name__)
@@ -61,9 +62,7 @@ def read_examples_from_file(data_dir, mode):
        for line in f:
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                if words:
-                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index),
+                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels))
-                                                 words=words,
-                                                 labels=labels))
                    guid_index += 1
                    words = []
                    labels = []
@@ -76,27 +75,27 @@ def read_examples_from_file(data_dir, mode):
                    # Examples could have no label for mode = "test"
                    labels.append("O")
        if words:
-            examples.append(InputExample(guid="%s-%d".format(mode, guid_index),
+            examples.append(InputExample(guid="%s-%d".format(mode, guid_index), words=words, labels=labels))
-                                         words=words,
-                                         labels=labels))
    return examples
-def convert_examples_to_features(examples,
+def convert_examples_to_features(
-                                 label_list,
+    examples,
-                                 max_seq_length,
+    label_list,
-                                 tokenizer,
+    max_seq_length,
-                                 cls_token_at_end=False,
+    tokenizer,
-                                 cls_token="[CLS]",
+    cls_token_at_end=False,
-                                 cls_token_segment_id=1,
+    cls_token="[CLS]",
-                                 sep_token="[SEP]",
+    cls_token_segment_id=1,
-                                 sep_token_extra=False,
+    sep_token="[SEP]",
-                                 pad_on_left=False,
+    sep_token_extra=False,
-                                 pad_token=0,
+    pad_on_left=False,
-                                 pad_token_segment_id=0,
+    pad_token=0,
-                                 pad_token_label_id=-100,
+    pad_token_segment_id=0,
-                                 sequence_a_segment_id=0,
+    pad_token_label_id=-100,
-                                 mask_padding_with_zero=True):
+    sequence_a_segment_id=0,
+    mask_padding_with_zero=True,
+):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
@@ -122,8 +121,8 @@ def convert_examples_to_features(examples,
        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = 3 if sep_token_extra else 2
        if len(tokens) > max_seq_length - special_tokens_count:
-            tokens = tokens[:(max_seq_length - special_tokens_count)]
+            tokens = tokens[: (max_seq_length - special_tokens_count)]
-            label_ids = label_ids[:(max_seq_length - special_tokens_count)]
+            label_ids = label_ids[: (max_seq_length - special_tokens_count)]
        # The convention in BERT is:
        # (a) For sequence pairs:
@@ -174,10 +173,10 @@ def convert_examples_to_features(examples,
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
        else:
-            input_ids += ([pad_token] * padding_length)
+            input_ids += [pad_token] * padding_length
-            input_mask += ([0 if mask_padding_with_zero else 1] * padding_length)
+            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
-            segment_ids += ([pad_token_segment_id] * padding_length)
+            segment_ids += [pad_token_segment_id] * padding_length
-            label_ids += ([pad_token_label_id] * padding_length)
+            label_ids += [pad_token_label_id] * padding_length
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
@@ -194,10 +193,8 @@ def convert_examples_to_features(examples,
            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))
        features.append(
-                InputFeatures(input_ids=input_ids,
+            InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids)
-                              input_mask=input_mask,
+        )
-                              segment_ids=segment_ids,
-                              label_ids=label_ids))
    return features
@@ -209,4 +206,4 @@ def get_labels(path):
            labels = ["O"] + labels
        return labels
    else:
-        return ["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
+        return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
--- a/hubconf.py
+++ b/hubconf.py
 from transformers import (
-    AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
+    AutoConfig,
+    AutoModel,
+    AutoModelForQuestionAnswering,
+    AutoModelForSequenceClassification,
+    AutoModelWithLMHead,
+    AutoTokenizer,
 )
 from transformers.file_utils import add_start_docstrings
-dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
+dependencies = ["torch", "tqdm", "boto3", "requests", "regex", "sentencepiece", "sacremoses"]
 @add_start_docstrings(AutoConfig.__doc__)
 def config(*args, **kwargs):
-    r""" 
+    r"""
                # Using torch.hub !
                import torch
@@ -27,7 +34,7 @@ def config(*args, **kwargs):
 @add_start_docstrings(AutoTokenizer.__doc__)
 def tokenizer(*args, **kwargs):
-    r""" 
+    r"""
        # Using torch.hub !
        import torch
@@ -57,6 +64,7 @@ def model(*args, **kwargs):
    return AutoModel.from_pretrained(*args, **kwargs)
 @add_start_docstrings(AutoModelWithLMHead.__doc__)
 def modelWithLMHead(*args, **kwargs):
    r"""

--- a/setup.cfg
+++ b/setup.cfg
+[isort]
+ensure_newline_before_comments = True
+force_grid_wrap = 0
+include_trailing_comma = True
+known_first_party = transformers
+known_third_party =
+    fairseq
+    fastprogress
+    git
+    nltk
+    packaging
+    PIL
+    psutil
+    seqeval
+    sklearn
+    tensorboardX
+    tensorflow_datasets
+    torchtext
+    torchvision
+line_length = 119
+lines_after_imports = 2
+multi_line_output = 3
+use_parentheses = True
+[flake8]
+ignore = E203, E501, F841, W503
+max-line-length = 119
--- a/setup.py
+++ b/setup.py
@@ -34,15 +34,16 @@ To create the package for pypi.
 """
 from io import open
 from setuptools import find_packages, setup
 extras = {
-    'serving': ['pydantic', 'uvicorn', 'fastapi'],
+    "serving": ["pydantic", "uvicorn", "fastapi"],
-    'serving-tf': ['pydantic', 'uvicorn', 'fastapi', 'tensorflow'],
+    "serving-tf": ["pydantic", "uvicorn", "fastapi", "tensorflow"],
-    'serving-torch': ['pydantic', 'uvicorn', 'fastapi', 'torch']
+    "serving-torch": ["pydantic", "uvicorn", "fastapi", "torch"],
 }
-extras['all'] = [package for package in extras.values()]
+extras["all"] = [package for package in extras.values()]
 setup(
    name="transformers",
@@ -50,30 +51,29 @@ setup(
    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
    author_email="thomas@huggingface.co",
    description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
-    long_description=open("README.md", "r", encoding='utf-8').read(),
+    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
-    keywords='NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU',
+    keywords="NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU",
-    license='Apache',
+    license="Apache",
    url="https://github.com/huggingface/transformers",
-    packages=find_packages(exclude=["*.tests", "*.tests.*",
+    packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
-                                    "tests.*", "tests"]),
+    install_requires=[
-    install_requires=['numpy',
+        "numpy",
-                      'boto3',
+        "boto3",
-                      'filelock',
+        "filelock",
-                      'requests',
+        "requests",
-                      'tqdm',
+        "tqdm",
-                      'regex != 2019.12.17',
+        "regex != 2019.12.17",
-                      'sentencepiece',
+        "sentencepiece",
-                      'sacremoses'],
+        "sacremoses",
-    extras_require=extras,
-    scripts=[
-        'transformers-cli'
    ],
+    extras_require=extras,
+    scripts=["transformers-cli"],
    # python_requires='>=3.5.0',
    classifiers=[
-          'Intended Audience :: Science/Research',
+        "Intended Audience :: Science/Research",
-          'License :: OSI Approved :: Apache Software License',
+        "License :: OSI Approved :: Apache Software License",
-          'Programming Language :: Python :: 3',
+        "Programming Language :: Python :: 3",
-          'Topic :: Scientific/Engineering :: Artificial Intelligence',
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
 )
--- a/templates/adding_a_new_example_script/run_xxx.py
+++ b/templates/adding_a_new_example_script/run_xxx.py
@@ -17,55 +17,70 @@
 from __future__ import absolute_import, division, print_function
 import argparse
+import glob
 import logging
 import os
 import random
-import glob
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
-                              TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except:
-    from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
-from transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (
-                                  BertForQuestionAnswering, BertTokenizer,
+    WEIGHTS_NAME,
-                                  XLMConfig, XLMForQuestionAnswering,
+    AdamW,
-                                  XLMTokenizer, XLNetConfig,
+    BertConfig,
-                                  XLNetForQuestionAnswering,
+    BertForQuestionAnswering,
-                                  XLNetTokenizer,
+    BertTokenizer,
-                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+    DistilBertConfig,
+    DistilBertForQuestionAnswering,
-from transformers import AdamW, get_linear_schedule_with_warmup
+    DistilBertTokenizer,
+    XLMConfig,
-from utils_squad import (read_squad_examples, convert_examples_to_features,
+    XLMForQuestionAnswering,
-                         RawResult, write_predictions,
+    XLMTokenizer,
-                         RawResultExtended, write_predictions_extended)
+    XLNetConfig,
+    XLNetForQuestionAnswering,
+    XLNetTokenizer,
+    get_linear_schedule_with_warmup,
+)
+from utils_squad import (
+    RawResult,
+    RawResultExtended,
+    convert_examples_to_features,
+    read_squad_examples,
+    write_predictions,
+    write_predictions_extended,
+)
 # The following import is the official SQuAD evaluation script (2.0).
 # You can remove it from the dependencies if you are using this script outside of the library
 # We've added it here for automated tests (see examples/test_examples.py file)
-from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
+from utils_squad_evaluate import EVAL_OPTS
+from utils_squad_evaluate import main as evaluate_on_squad
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:
+    from tensorboardX import SummaryWriter
 logger = logging.getLogger(__name__)
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
+ALL_MODELS = sum(
-                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ()
+)
 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
+    "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer),
-    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
+    "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
-    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
+    "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+    "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
 }
 def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
@@ -73,9 +88,11 @@ def set_seed(args):
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
 def to_list(tensor):
    return tensor.detach().cpu().tolist()
 def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
@@ -92,13 +109,18 @@ def train(args, train_dataset, model, tokenizer):
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-        ]
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
    if args.fp16:
        try:
            from apex import amp
@@ -112,17 +134,21 @@ def train(args, train_dataset, model, tokenizer):
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+        model = torch.nn.parallel.DistributedDataParallel(
-                                                          output_device=args.local_rank,
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-                                                          find_unused_parameters=True)
+        )
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+    logger.info(
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
@@ -136,20 +162,21 @@ def train(args, train_dataset, model, tokenizer):
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':       batch[0],
+            inputs = {
-                      'attention_mask':  batch[1],
+                "input_ids": batch[0],
-                      'start_positions': batch[3],
+                "attention_mask": batch[1],
-                      'end_positions':   batch[4]}
+                "start_positions": batch[3],
-            if args.model_type != 'distilbert':
+                "end_positions": batch[4],
-                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
+            }
-            if args.model_type in ['xlnet', 'xlm']:
+            if args.model_type != "distilbert":
-                inputs.update({'cls_index': batch[5],
+                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]
-                               'p_mask':       batch[6]})
+            if args.model_type in ["xlnet", "xlm"]:
+                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
@@ -173,22 +200,26 @@ def train(args, train_dataset, model, tokenizer):
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
            if args.max_steps > 0 and global_step > args.max_steps:
@@ -224,32 +255,31 @@ def evaluate(args, model, tokenizer, prefix=""):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
-            inputs = {'input_ids':      batch[0],
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
-                      'attention_mask': batch[1]
+            if args.model_type != "distilbert":
-                      }
+                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]  # XLM don't use segment_ids
-            if args.model_type != 'distilbert':
-                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
            example_indices = batch[3]
-            if args.model_type in ['xlnet', 'xlm']:
+            if args.model_type in ["xlnet", "xlm"]:
-                inputs.update({'cls_index': batch[4],
+                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
-                               'p_mask':    batch[5]})
            outputs = model(**inputs)
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
-            if args.model_type in ['xlnet', 'xlm']:
+            if args.model_type in ["xlnet", "xlm"]:
                # XLNet uses a more complex post-processing procedure
-                result = RawResultExtended(unique_id            = unique_id,
+                result = RawResultExtended(
-                                           start_top_log_probs  = to_list(outputs[0][i]),
+                    unique_id=unique_id,
-                                           start_top_index      = to_list(outputs[1][i]),
+                    start_top_log_probs=to_list(outputs[0][i]),
-                                           end_top_log_probs    = to_list(outputs[2][i]),
+                    start_top_index=to_list(outputs[1][i]),
-                                           end_top_index        = to_list(outputs[3][i]),
+                    end_top_log_probs=to_list(outputs[2][i]),
-                                           cls_logits           = to_list(outputs[4][i]))
+                    end_top_index=to_list(outputs[3][i]),
+                    cls_logits=to_list(outputs[4][i]),
+                )
            else:
-                result = RawResult(unique_id    = unique_id,
+                result = RawResult(
-                                   start_logits = to_list(outputs[0][i]),
+                    unique_id=unique_id, start_logits=to_list(outputs[0][i]), end_logits=to_list(outputs[1][i])
-                                   end_logits   = to_list(outputs[1][i]))
+                )
            all_results.append(result)
    # Compute predictions
@@ -260,23 +290,44 @@ def evaluate(args, model, tokenizer, prefix=""):
    else:
        output_null_log_odds_file = None
-    if args.model_type in ['xlnet', 'xlm']:
+    if args.model_type in ["xlnet", "xlm"]:
        # XLNet uses a more complex post-processing procedure
-        write_predictions_extended(examples, features, all_results, args.n_best_size,
+        write_predictions_extended(
-                        args.max_answer_length, output_prediction_file,
+            examples,
-                        output_nbest_file, output_null_log_odds_file, args.predict_file,
+            features,
-                        model.config.start_n_top, model.config.end_n_top,
+            all_results,
-                        args.version_2_with_negative, tokenizer, args.verbose_logging)
+            args.n_best_size,
+            args.max_answer_length,
+            output_prediction_file,
+            output_nbest_file,
+            output_null_log_odds_file,
+            args.predict_file,
+            model.config.start_n_top,
+            model.config.end_n_top,
+            args.version_2_with_negative,
+            tokenizer,
+            args.verbose_logging,
+        )
    else:
-        write_predictions(examples, features, all_results, args.n_best_size,
+        write_predictions(
-                        args.max_answer_length, args.do_lower_case, output_prediction_file,
+            examples,
-                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
+            features,
-                        args.version_2_with_negative, args.null_score_diff_threshold)
+            all_results,
+            args.n_best_size,
+            args.max_answer_length,
+            args.do_lower_case,
+            output_prediction_file,
+            output_nbest_file,
+            output_null_log_odds_file,
+            args.verbose_logging,
+            args.version_2_with_negative,
+            args.null_score_diff_threshold,
+        )
    # Evaluate with the official SQuAD script
-    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
+    evaluate_options = EVAL_OPTS(
-                                 pred_file=output_prediction_file,
+        data_file=args.predict_file, pred_file=output_prediction_file, na_prob_file=output_null_log_odds_file
-                                 na_prob_file=output_null_log_odds_file)
+    )
    results = evaluate_on_squad(evaluate_options)
    return results
@@ -287,24 +338,30 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
-    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
+    cached_features_file = os.path.join(
-        'dev' if evaluate else 'train',
+        os.path.dirname(input_file),
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
+        "cached_{}_{}_{}".format(
-        str(args.max_seq_length)))
+            "dev" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+        ),
+    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
-        examples = read_squad_examples(input_file=input_file,
+        examples = read_squad_examples(
-                                                is_training=not evaluate,
+            input_file=input_file, is_training=not evaluate, version_2_with_negative=args.version_2_with_negative
-                                                version_2_with_negative=args.version_2_with_negative)
+        )
-        features = convert_examples_to_features(examples=examples,
+        features = convert_examples_to_features(
-                                                tokenizer=tokenizer,
+            examples=examples,
-                                                max_seq_length=args.max_seq_length,
+            tokenizer=tokenizer,
-                                                doc_stride=args.doc_stride,
+            max_seq_length=args.max_seq_length,
-                                                max_query_length=args.max_query_length,
+            doc_stride=args.doc_stride,
-                                                is_training=not evaluate)
+            max_query_length=args.max_query_length,
+            is_training=not evaluate,
+        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
@@ -320,14 +377,21 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+        dataset = TensorDataset(
-                                all_example_index, all_cls_index, all_p_mask)
+            all_input_ids, all_input_mask, all_segment_ids, all_example_index, all_cls_index, all_p_mask
+        )
    else:
        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+        dataset = TensorDataset(
-                                all_start_positions, all_end_positions,
+            all_input_ids,
-                                all_cls_index, all_p_mask)
+            all_input_mask,
+            all_segment_ids,
+            all_start_positions,
+            all_end_positions,
+            all_cls_index,
+            all_p_mask,
+        )
    if output_examples:
        return dataset, examples, features
@@ -337,110 +401,191 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
 def main():
    parser = argparse.ArgumentParser()
-    ## Required parameters
+    # Required parameters
-    parser.add_argument("--train_file", default=None, type=str, required=True,
+    parser.add_argument(
-                        help="SQuAD json for training. E.g., train-v1.1.json")
+        "--train_file", default=None, type=str, required=True, help="SQuAD json for training. E.g., train-v1.1.json"
-    parser.add_argument("--predict_file", default=None, type=str, required=True,
+    )
-                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument(
-    parser.add_argument("--model_type", default=None, type=str, required=True,
+        "--predict_file",
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+        default=None,
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+        type=str,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+        required=True,
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
+        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json",
-                        help="The output directory where the model checkpoints and predictions will be written.")
+    )
+    parser.add_argument(
-    ## Other parameters
+        "--model_type",
-    parser.add_argument("--config_name", default="", type=str,
+        default=None,
-                        help="Pretrained config name or path if not the same as model_name")
+        type=str,
-    parser.add_argument("--tokenizer_name", default="", type=str,
+        required=True,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    parser.add_argument("--cache_dir", default="", type=str,
+    )
-                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument(
+        "--model_name_or_path",
-    parser.add_argument('--version_2_with_negative', action='store_true',
+        default=None,
-                        help='If true, the SQuAD examples contain some that do not have an answer.')
+        type=str,
-    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
+        required=True,
-                        help="If null_score - best_non_null is greater than the threshold predict null.")
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
-    parser.add_argument("--max_seq_length", default=384, type=int,
+    parser.add_argument(
-                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
+        "--output_dir",
-                             "longer than this will be truncated, and sequences shorter than this will be padded.")
+        default=None,
-    parser.add_argument("--doc_stride", default=128, type=int,
+        type=str,
-                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
+        required=True,
-    parser.add_argument("--max_query_length", default=64, type=int,
+        help="The output directory where the model checkpoints and predictions will be written.",
-                        help="The maximum number of tokens for the question. Questions longer than this will "
+    )
-                             "be truncated to this length.")
-    parser.add_argument("--do_train", action='store_true',
+    # Other parameters
-                        help="Whether to run training.")
+    parser.add_argument(
-    parser.add_argument("--do_eval", action='store_true',
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
-                        help="Whether to run eval on the dev set.")
+    )
-    parser.add_argument("--evaluate_during_training", action='store_true',
+    parser.add_argument(
-                        help="Rul evaluation during training at each logging step.")
+        "--tokenizer_name",
-    parser.add_argument("--do_lower_case", action='store_true',
+        default="",
-                        help="Set this flag if you are using an uncased model.")
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+    )
-                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+        "--cache_dir",
-                        help="Batch size per GPU/CPU for evaluation.")
+        default="",
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
+        type=str,
-                        help="The initial learning rate for Adam.")
+        help="Where do you want to store the pre-trained models downloaded from s3",
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+    )
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
+    parser.add_argument(
-                        help="Weight deay if we apply some.")
+        "--version_2_with_negative",
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+        action="store_true",
-                        help="Epsilon for Adam optimizer.")
+        help="If true, the SQuAD examples contain some that do not have an answer.",
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+    )
-                        help="Max gradient norm.")
+    parser.add_argument(
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+        "--null_score_diff_threshold",
-                        help="Total number of training epochs to perform.")
+        type=float,
-    parser.add_argument("--max_steps", default=-1, type=int,
+        default=0.0,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+        help="If null_score - best_non_null is greater than the threshold predict null.",
-    parser.add_argument("--warmup_steps", default=0, type=int,
+    )
-                        help="Linear warmup over warmup_steps.")
-    parser.add_argument("--n_best_size", default=20, type=int,
+    parser.add_argument(
-                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
+        "--max_seq_length",
-    parser.add_argument("--max_answer_length", default=30, type=int,
+        default=384,
-                        help="The maximum length of an answer that can be generated. This is needed because the start "
+        type=int,
-                             "and end predictions are not conditioned on one another.")
+        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
-    parser.add_argument("--verbose_logging", action='store_true',
+        "longer than this will be truncated, and sequences shorter than this will be padded.",
-                        help="If true, all of the warnings related to data processing will be printed. "
+    )
-                             "A number of warnings are expected for a normal SQuAD evaluation.")
+    parser.add_argument(
+        "--doc_stride",
-    parser.add_argument('--logging_steps', type=int, default=50,
+        default=128,
-                        help="Log every X updates steps.")
+        type=int,
-    parser.add_argument('--save_steps', type=int, default=50,
+        help="When splitting up a long document into chunks, how much stride to take between chunks.",
-                        help="Save checkpoint every X updates steps.")
+    )
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
+    parser.add_argument(
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+        "--max_query_length",
-    parser.add_argument("--no_cuda", action='store_true',
+        default=64,
-                        help="Whether not to use CUDA when available")
+        type=int,
-    parser.add_argument('--overwrite_output_dir', action='store_true',
+        help="The maximum number of tokens for the question. Questions longer than this will "
-                        help="Overwrite the content of the output directory")
+        "be truncated to this length.",
-    parser.add_argument('--overwrite_cache', action='store_true',
+    )
-                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument('--seed', type=int, default=42,
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-                        help="random seed for initialization")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
-    parser.add_argument("--local_rank", type=int, default=-1,
+    )
-                        help="local_rank for distributed training on gpus")
+    parser.add_argument(
-    parser.add_argument('--fp16', action='store_true',
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    )
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
-                             "See details at https://nvidia.github.io/apex/amp.html")
+    parser.add_argument(
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+    parser.add_argument(
+        "--n_best_size",
+        default=20,
+        type=int,
+        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
+    )
+    parser.add_argument(
+        "--max_answer_length",
+        default=30,
+        type=int,
+        help="The maximum length of an answer that can be generated. This is needed because the start "
+        "and end predictions are not conditioned on one another.",
+    )
+    parser.add_argument(
+        "--verbose_logging",
+        action="store_true",
+        help="If true, all of the warnings related to data processing will be printed. "
+        "A number of warnings are expected for a normal SQuAD evaluation.",
+    )
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+    if (
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -452,16 +597,24 @@ def main():
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device
    # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+    logging.basicConfig(
-                        datefmt = '%m/%d/%Y %H:%M:%S',
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+        datefmt="%m/%d/%Y %H:%M:%S",
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
    # Set seed
    set_seed(args)
@@ -472,15 +625,21 @@ def main():
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
+    config = config_class.from_pretrained(
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
+        args.config_name if args.config_name else args.model_name_or_path,
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        cache_dir=args.cache_dir if args.cache_dir else None,
-                                                do_lower_case=args.do_lower_case,
+    )
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
+    tokenizer = tokenizer_class.from_pretrained(
-    model = model_class.from_pretrained(args.model_name_or_path,
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
+        do_lower_case=args.do_lower_case,
-                                        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -495,7 +654,8 @@ def main():
    if args.fp16:
        try:
            import apex
-            apex.amp.register_half_function(torch, 'einsum')
+            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
@@ -505,7 +665,6 @@ def main():
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
@@ -515,39 +674,42 @@ def main():
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)
    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # Reload the model
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            # Evaluate
            result = evaluate(args, model, tokenizer, prefix=global_step)
-            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)
    logger.info("Results: {}".format(results))

--- a/templates/adding_a_new_example_script/utils_xxx.py
+++ b/templates/adding_a_new_example_script/utils_xxx.py
 # coding=utf-8
 # Copyright 2018 XXX.  All rights reserved.
 #
@@ -17,16 +16,17 @@
 from __future__ import absolute_import, division, print_function
+import collections
 import json
 import logging
 import math
-import collections
 from io import open
 from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
 # Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
-from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
+from utils_squad_evaluate import find_all_best_thresh_v2, get_raw_scores, make_qid_to_has_ans
 logger = logging.getLogger(__name__)
@@ -37,14 +37,16 @@ class SquadExample(object):
    For examples without an answer, the start and end position are -1.
    """
-    def __init__(self,
+    def __init__(
-                 qas_id,
+        self,
-                 question_text,
+        qas_id,
-                 doc_tokens,
+        question_text,
-                 orig_answer_text=None,
+        doc_tokens,
-                 start_position=None,
+        orig_answer_text=None,
-                 end_position=None,
+        start_position=None,
-                 is_impossible=None):
+        end_position=None,
+        is_impossible=None,
+    ):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
@@ -59,8 +61,7 @@ class SquadExample(object):
    def __repr__(self):
        s = ""
        s += "qas_id: %s" % (self.qas_id)
-        s += ", question_text: %s" % (
+        s += ", question_text: %s" % (self.question_text)
-            self.question_text)
        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
@@ -74,22 +75,24 @@ class SquadExample(object):
 class InputFeatures(object):
    """A single set of features of data."""
-    def __init__(self,
+    def __init__(
-                 unique_id,
+        self,
-                 example_index,
+        unique_id,
-                 doc_span_index,
+        example_index,
-                 tokens,
+        doc_span_index,
-                 token_to_orig_map,
+        tokens,
-                 token_is_max_context,
+        token_to_orig_map,
-                 input_ids,
+        token_is_max_context,
-                 input_mask,
+        input_ids,
-                 segment_ids,
+        input_mask,
-                 cls_index,
+        segment_ids,
-                 p_mask,
+        cls_index,
-                 paragraph_len,
+        p_mask,
-                 start_position=None,
+        paragraph_len,
-                 end_position=None,
+        start_position=None,
-                 is_impossible=None):
+        end_position=None,
+        is_impossible=None,
+    ):
        self.unique_id = unique_id
        self.example_index = example_index
        self.doc_span_index = doc_span_index
@@ -109,7 +112,7 @@ class InputFeatures(object):
 def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
-    with open(input_file, "r", encoding='utf-8') as reader:
+    with open(input_file, "r", encoding="utf-8") as reader:
        input_data = json.load(reader)["data"]
    def is_whitespace(c):
@@ -146,8 +149,7 @@ def read_squad_examples(input_file, is_training, version_2_with_negative):
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
-                        raise ValueError(
+                        raise ValueError("For training, each question should have exactly 1 answer.")
-                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
@@ -161,12 +163,10 @@ def read_squad_examples(input_file, is_training, version_2_with_negative):
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
-                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
+                        actual_text = " ".join(doc_tokens[start_position : (end_position + 1)])
-                        cleaned_answer_text = " ".join(
+                        cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
-                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
-                            logger.warning("Could not find answer: '%s' vs. '%s'",
+                            logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
-                                           actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
@@ -180,18 +180,29 @@ def read_squad_examples(input_file, is_training, version_2_with_negative):
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
-                    is_impossible=is_impossible)
+                    is_impossible=is_impossible,
+                )
                examples.append(example)
    return examples
-def convert_examples_to_features(examples, tokenizer, max_seq_length,
+def convert_examples_to_features(
-                                 doc_stride, max_query_length, is_training,
+    examples,
-                                 cls_token_at_end=False,
+    tokenizer,
-                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
+    max_seq_length,
-                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
+    doc_stride,
-                                 cls_token_segment_id=0, pad_token_segment_id=0,
+    max_query_length,
-                                 mask_padding_with_zero=True):
+    is_training,
+    cls_token_at_end=False,
+    cls_token="[CLS]",
+    sep_token="[SEP]",
+    pad_token=0,
+    sequence_a_segment_id=0,
+    sequence_b_segment_id=1,
+    cls_token_segment_id=0,
+    pad_token_segment_id=0,
+    mask_padding_with_zero=True,
+):
    """Loads a data file into a list of `InputBatch`s."""
    unique_id = 1000000000
@@ -232,8 +243,8 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
            else:
                tok_end_position = len(all_doc_tokens) - 1
            (tok_start_position, tok_end_position) = _improve_answer_span(
-                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
+                all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.orig_answer_text
-                example.orig_answer_text)
+            )
        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
@@ -241,8 +252,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we do a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
-        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
+        _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])  # pylint: disable=invalid-name
-            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
@@ -287,8 +297,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
-                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
+                is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index)
-                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(sequence_b_segment_id)
@@ -333,8 +342,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                doc_start = doc_span.start
                doc_end = doc_span.start + doc_span.length - 1
                out_of_span = False
-                if not (tok_start_position >= doc_start and
+                if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
-                        tok_end_position <= doc_end):
                    out_of_span = True
                if out_of_span:
                    start_position = 0
@@ -355,24 +363,23 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info("tokens: %s" % " ".join(tokens))
-                logger.info("token_to_orig_map: %s" % " ".join([
-                    "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
-                logger.info("token_is_max_context: %s" % " ".join([
-                    "%d:%s" % (x, y) for (x, y) in token_is_max_context.items()
-                ]))
-                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info(
-                    "input_mask: %s" % " ".join([str(x) for x in input_mask]))
+                    "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])
+                )
                logger.info(
-                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+                    "token_is_max_context: %s"
+                    % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()])
+                )
+                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+                logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
                if is_training and span_is_impossible:
                    logger.info("impossible example")
                if is_training and not span_is_impossible:
-                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
+                    answer_text = " ".join(tokens[start_position : (end_position + 1)])
                    logger.info("start_position: %d" % (start_position))
                    logger.info("end_position: %d" % (end_position))
-                    logger.info(
+                    logger.info("answer: %s" % (answer_text))
-                        "answer: %s" % (answer_text))
            features.append(
                InputFeatures(
@@ -390,14 +397,15 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                    paragraph_len=paragraph_len,
                    start_position=start_position,
                    end_position=end_position,
-                    is_impossible=span_is_impossible))
+                    is_impossible=span_is_impossible,
+                )
+            )
            unique_id += 1
    return features
-def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
-                         orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""
    # The SQuAD annotations are character based. We first project them to
@@ -426,7 +434,7 @@ def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
-            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
+            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)
@@ -470,13 +478,23 @@ def _check_is_max_context(doc_spans, cur_span_index, position):
    return cur_span_index == best_span_index
-RawResult = collections.namedtuple("RawResult",
+RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"])
-                                   ["unique_id", "start_logits", "end_logits"])
-def write_predictions(all_examples, all_features, all_results, n_best_size,
+def write_predictions(
-                      max_answer_length, do_lower_case, output_prediction_file,
+    all_examples,
-                      output_nbest_file, output_null_log_odds_file, verbose_logging,
+    all_features,
-                      version_2_with_negative, null_score_diff_threshold):
+    all_results,
+    n_best_size,
+    max_answer_length,
+    do_lower_case,
+    output_prediction_file,
+    output_nbest_file,
+    output_null_log_odds_file,
+    verbose_logging,
+    version_2_with_negative,
+    null_score_diff_threshold,
+):
    """Write final predictions to the json file and log-odds of null if needed."""
    logger.info("Writing predictions to: %s" % (output_prediction_file))
    logger.info("Writing nbest to: %s" % (output_nbest_file))
@@ -490,8 +508,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
        unique_id_to_result[result.unique_id] = result
    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction",
+        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
-        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
+    )
    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
@@ -544,7 +562,9 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
-                            end_logit=result.end_logits[end_index]))
+                            end_logit=result.end_logits[end_index],
+                        )
+                    )
        if version_2_with_negative:
            prelim_predictions.append(
                _PrelimPrediction(
@@ -552,14 +572,14 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                    start_index=0,
                    end_index=0,
                    start_logit=null_start_logit,
-                    end_logit=null_end_logit))
+                    end_logit=null_end_logit,
-        prelim_predictions = sorted(
+                )
-            prelim_predictions,
+            )
-            key=lambda x: (x.start_logit + x.end_logit),
+        prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
-            reverse=True)
        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-            "NbestPrediction", ["text", "start_logit", "end_logit"])
+            "NbestPrediction", ["text", "start_logit", "end_logit"]
+        )
        seen_predictions = {}
        nbest = []
@@ -568,10 +588,10 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                break
            feature = features[pred.feature_index]
            if pred.start_index > 0:  # this is a non-null prediction
-                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+                tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
-                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+                orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
                tok_text = " ".join(tok_tokens)
                # De-tokenize WordPieces that have been split off.
@@ -592,31 +612,21 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                final_text = ""
                seen_predictions[final_text] = True
-            nbest.append(
+            nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
-                _NbestPrediction(
-                    text=final_text,
-                    start_logit=pred.start_logit,
-                    end_logit=pred.end_logit))
        # if we didn't include the empty option in the n-best, include it
        if version_2_with_negative:
            if "" not in seen_predictions:
-                nbest.append(
+                nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
-                    _NbestPrediction(
-                        text="",
-                        start_logit=null_start_logit,
-                        end_logit=null_end_logit))
            # In very rare edge cases we could only have single null prediction.
            # So we just create a nonce prediction in this case to avoid failure.
-            if len(nbest)==1:
+            if len(nbest) == 1:
-                nbest.insert(0,
+                nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
-                    _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
-            nbest.append(
+            nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
-                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
        assert len(nbest) >= 1
@@ -645,8 +655,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
            all_predictions[example.qas_id] = nbest_json[0]["text"]
        else:
            # predict "" iff the null score - the score of best non-null > threshold
-            score_diff = score_null - best_non_null_entry.start_logit - (
+            score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
-                best_non_null_entry.end_logit)
            scores_diff_json[example.qas_id] = score_diff
            if score_diff > null_score_diff_threshold:
                all_predictions[example.qas_id] = ""
@@ -668,29 +677,40 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
 # For XLNet (and XLM which uses the same head)
-RawResultExtended = collections.namedtuple("RawResultExtended",
+RawResultExtended = collections.namedtuple(
-    ["unique_id", "start_top_log_probs", "start_top_index",
+    "RawResultExtended",
-     "end_top_log_probs", "end_top_index", "cls_logits"])
+    ["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits"],
+)
-def write_predictions_extended(all_examples, all_features, all_results, n_best_size,
-                                max_answer_length, output_prediction_file,
+def write_predictions_extended(
-                                output_nbest_file,
+    all_examples,
-                                output_null_log_odds_file, orig_data_file,
+    all_features,
-                                start_n_top, end_n_top, version_2_with_negative,
+    all_results,
-                                tokenizer, verbose_logging):
+    n_best_size,
+    max_answer_length,
+    output_prediction_file,
+    output_nbest_file,
+    output_null_log_odds_file,
+    orig_data_file,
+    start_n_top,
+    end_n_top,
+    version_2_with_negative,
+    tokenizer,
+    verbose_logging,
+):
    """ XLNet write prediction logic (more complex than Bert's).
        Write final predictions to the json file and log-odds of null if needed.
        Requires utils_squad_evaluate.py
    """
    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction",
+        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]
-        ["feature_index", "start_index", "end_index",
+    )
-        "start_log_prob", "end_log_prob"])
    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
+        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]
+    )
    logger.info("Writing predictions to: %s", output_prediction_file)
    # logger.info("Writing nbest to: %s" % (output_nbest_file))
@@ -754,12 +774,13 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
                            start_index=start_index,
                            end_index=end_index,
                            start_log_prob=start_log_prob,
-                            end_log_prob=end_log_prob))
+                            end_log_prob=end_log_prob,
+                        )
+                    )
        prelim_predictions = sorted(
-            prelim_predictions,
+            prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True
-            key=lambda x: (x.start_log_prob + x.end_log_prob),
+        )
-            reverse=True)
        seen_predictions = {}
        nbest = []
@@ -770,7 +791,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
            # XLNet un-tokenizer
            # Let's keep it simple for now and see if we need all this later.
-            # 
+            #
            # tok_start_to_orig_index = feature.tok_start_to_orig_index
            # tok_end_to_orig_index = feature.tok_end_to_orig_index
            # start_orig_pos = tok_start_to_orig_index[pred.start_index]
@@ -779,10 +800,10 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
            # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
            # Previously used Bert untokenizer
-            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+            tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
            orig_doc_start = feature.token_to_orig_map[pred.start_index]
            orig_doc_end = feature.token_to_orig_map[pred.end_index]
-            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+            orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
            # Clean whitespace
@@ -790,8 +811,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
            tok_text = " ".join(tok_text.split())
            orig_text = " ".join(orig_tokens)
-            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
+            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging)
-                                        verbose_logging)
            if final_text in seen_predictions:
                continue
@@ -799,17 +819,13 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
            seen_predictions[final_text] = True
            nbest.append(
-                _NbestPrediction(
+                _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob)
-                    text=final_text,
+            )
-                    start_log_prob=pred.start_log_prob,
-                    end_log_prob=pred.end_log_prob))
        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
-            nbest.append(
+            nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6))
-                _NbestPrediction(text="", start_log_prob=-1e6,
-                end_log_prob=-1e6))
        total_scores = []
        best_non_null_entry = None
@@ -850,7 +866,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
-    with open(orig_data_file, "r", encoding='utf-8') as reader:
+    with open(orig_data_file, "r", encoding="utf-8") as reader:
        orig_data = json.load(reader)["data"]
    qid_to_has_ans = make_qid_to_has_ans(orig_data)
@@ -914,8 +930,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
-            logger.info(
+            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
-                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1
@@ -924,8 +939,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
-            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
+            logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
-                        orig_ns_text, tok_ns_text)
        return orig_text
    # We then project the characters in `pred_text` back to `orig_text` using
@@ -956,7 +970,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
            logger.info("Couldn't map end position")
        return orig_text
-    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
+    output_text = orig_text[orig_start_position : (orig_end_position + 1)]
    return output_text

--- a/templates/adding_a_new_model/configuration_xxx.py
+++ b/templates/adding_a_new_model/configuration_xxx.py
@@ -16,19 +16,16 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
-import json
 import logging
-import sys
-import six
-from io import open
 from .configuration_utils import PretrainedConfig
 logger = logging.getLogger(__name__)
 XXX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-config.json",
+    "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-config.json",
-    'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-config.json",
+    "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-config.json",
 }
@@ -63,24 +60,26 @@ class XxxConfig(PretrainedConfig):
    """
    pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP
-    def __init__(self,
+    def __init__(
-                 vocab_size=50257,
+        self,
-                 n_positions=1024,
+        vocab_size=50257,
-                 n_ctx=1024,
+        n_positions=1024,
-                 n_embd=768,
+        n_ctx=1024,
-                 n_layer=12,
+        n_embd=768,
-                 n_head=12,
+        n_layer=12,
-                 resid_pdrop=0.1,
+        n_head=12,
-                 embd_pdrop=0.1,
+        resid_pdrop=0.1,
-                 attn_pdrop=0.1,
+        embd_pdrop=0.1,
-                 layer_norm_epsilon=1e-5,
+        attn_pdrop=0.1,
-                 initializer_range=0.02,
+        layer_norm_epsilon=1e-5,
-                 summary_type='cls_index',
+        initializer_range=0.02,
-                 summary_use_proj=True,
+        summary_type="cls_index",
-                 summary_activation=None,
+        summary_use_proj=True,
-                 summary_proj_to_labels=True,
+        summary_activation=None,
-                 summary_first_dropout=0.1,
+        summary_proj_to_labels=True,
-                 **kwargs):
+        summary_first_dropout=0.1,
+        **kwargs
+    ):
        super(XxxConfig, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.n_ctx = n_ctx

--- a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
+++ b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
@@ -14,18 +14,19 @@
 # limitations under the License.
 """Convert XXX checkpoint."""
-from __future__ import absolute_import
+from __future__ import absolute_import, division, print_function
-from __future__ import division
-from __future__ import print_function
 import argparse
+import logging
 import torch
 from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx
-import logging
 logging.basicConfig(level=logging.INFO)
 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = XxxConfig.from_json_file(config_file)
@@ -42,24 +43,20 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    ## Required parameters
+    # Required parameters
-    parser.add_argument("--tf_checkpoint_path",
+    parser.add_argument(
-                        default = None,
+        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
-                        type = str,
+    )
-                        required = True,
+    parser.add_argument(
-                        help = "Path to the TensorFlow checkpoint path.")
+        "--config_file",
-    parser.add_argument("--config_file",
+        default=None,
-                        default = None,
+        type=str,
-                        type = str,
+        required=True,
-                        required = True,
+        help="The config json file corresponding to the pre-trained model. \n"
-                        help = "The config json file corresponding to the pre-trained model. \n"
+        "This specifies the model architecture.",
-                            "This specifies the model architecture.")
+    )
-    parser.add_argument("--pytorch_dump_path",
+    parser.add_argument(
-                        default = None,
+        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
-                        type = str,
+    )
-                        required = True,
-                        help = "Path to the output PyTorch model.")
    args = parser.parse_args()
-    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
+    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
-                                     args.config_file,
-                                     args.pytorch_dump_path)
--- a/templates/adding_a_new_model/modeling_tf_xxx.py
+++ b/templates/adding_a_new_model/modeling_tf_xxx.py
@@ -21,21 +21,14 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
-import json
 import logging
-import math
-import os
-import sys
-import copy
-import itertools
-from io import open
-import numpy as np
 import tensorflow as tf
 from .configuration_xxx import XxxConfig
-from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
 from .file_utils import add_start_docstrings
+from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
 logger = logging.getLogger(__name__)
@@ -44,10 +37,11 @@ logger = logging.getLogger(__name__)
 # for the pretrained weights provided with the models
 ####################################################
 TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-tf_model.h5",
+    "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-tf_model.h5",
-    'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-tf_model.h5",
+    "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-tf_model.h5",
 }
 ####################################################
 # TF 2.0 Models are constructed using Keras imperative API by sub-classing
 # - tf.keras.layers.Layer for the layers and
@@ -66,12 +60,20 @@ TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = {
 #
 # See the conversion methods in modeling_tf_pytorch_utils.py for more details
 ####################################################
+TFXxxAttention = tf.keras.layers.Layer
+TFXxxIntermediate = tf.keras.layers.Layer
+TFXxxOutput = tf.keras.layers.Layer
 class TFXxxLayer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFXxxLayer, self).__init__(**kwargs)
-        self.attention = TFXxxAttention(config, name='attention')
+        self.attention = TFXxxAttention(config, name="attention")
-        self.intermediate = TFXxxIntermediate(config, name='intermediate')
+        self.intermediate = TFXxxIntermediate(config, name="intermediate")
-        self.transformer_output = TFXxxOutput(config, name='output')
+        self.transformer_output = TFXxxOutput(config, name="output")
    def call(self, inputs, training=False):
        hidden_states, attention_mask, head_mask = inputs
@@ -98,7 +100,9 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
    def _prune_heads(self, heads_to_prune):
        raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
-    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
+    def call(
+        self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False
+    ):
        # We allow three types of multi-inputs:
        # - traditional keyword arguments in the call method
        # - all the arguments provided as a dict in the first positional argument of call
@@ -113,11 +117,11 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            assert len(inputs) <= 5, "Too many inputs."
        elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
+            input_ids = inputs.get("input_ids")
-            attention_mask = inputs.get('attention_mask', attention_mask)
+            attention_mask = inputs.get("attention_mask", attention_mask)
-            token_type_ids = inputs.get('token_type_ids', token_type_ids)
+            token_type_ids = inputs.get("token_type_ids", token_type_ids)
-            position_ids = inputs.get('position_ids', position_ids)
+            position_ids = inputs.get("position_ids", position_ids)
-            head_mask = inputs.get('head_mask', head_mask)
+            head_mask = inputs.get("head_mask", head_mask)
            assert len(inputs) <= 5, "Too many inputs."
        else:
            input_ids = inputs
@@ -148,7 +152,7 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if not head_mask is None:
+        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.num_hidden_layers
@@ -175,6 +179,7 @@ class TFXxxPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for dowloading and loading pretrained models.
    """
    config_class = XxxConfig
    pretrained_model_archive_map = TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "transformer"
@@ -212,7 +217,7 @@ XXX_START_DOCSTRING = r"""    The XXX model was proposed in
            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
    Parameters:
-        config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
@@ -226,13 +231,13 @@ XXX_INPUTS_DOCSTRING = r"""
            (a) For sequence pairs:
                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
                ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
            (b) For single sequences:
                ``tokens:         [CLS] the dog is hairy . [SEP]``
                ``token_type_ids:   0   0   0   0  0     0   0``
            Xxx is a model with absolute position embeddings so it's usually advised to pad the inputs on
@@ -263,8 +268,12 @@ XXX_INPUTS_DOCSTRING = r"""
            than the model's internal embedding lookup matrix.
 """
-@add_start_docstrings("The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.",
-                      XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    "The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.",
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class TFXxxModel(TFXxxPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -297,17 +306,22 @@ class TFXxxModel(TFXxxPreTrainedModel):
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFXxxModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFXxxMainLayer(config, name='transformer')
+        self.transformer = TFXxxMainLayer(config, name="transformer")
    def call(self, inputs, **kwargs):
        outputs = self.transformer(inputs, **kwargs)
        return outputs
-@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """,
+TFXxxMLMHead = tf.keras.layers.Layer
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    """Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING
+)
 class TFXxxForMaskedLM(TFXxxPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -333,26 +347,30 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel):
        prediction_scores = outputs[0]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFXxxForMaskedLM, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFXxxMainLayer(config, name='transformer')
+        self.transformer = TFXxxMainLayer(config, name="transformer")
-        self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name='mlm')
+        self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm")
    def call(self, inputs, **kwargs):
        outputs = self.transformer(inputs, **kwargs)
        sequence_output = outputs[0]
-        prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False))
+        prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
        return outputs  # prediction_scores, (hidden_states), (attentions)
-@add_start_docstrings("""Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings(
+    """Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class TFXxxForSequenceClassification(TFXxxPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -378,22 +396,23 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel):
        logits = outputs[0]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFXxxForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
-        self.transformer = TFXxxMainLayer(config, name='transformer')
+        self.transformer = TFXxxMainLayer(config, name="transformer")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(config.num_labels,
+        self.classifier = tf.keras.layers.Dense(
-                                                kernel_initializer=get_initializer(config.initializer_range),
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
-                                                name='classifier')
+        )
    def call(self, inputs, **kwargs):
        outputs = self.transformer(inputs, **kwargs)
        pooled_output = outputs[1]
-        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
+        pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
        logits = self.classifier(pooled_output)
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
@@ -401,9 +420,12 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel):
        return outputs  # logits, (hidden_states), (attentions)
-@add_start_docstrings("""Xxx Model with a token classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """Xxx Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class TFXxxForTokenClassification(TFXxxPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -429,22 +451,23 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel):
        scores = outputs[0]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFXxxForTokenClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
-        self.transformer = TFXxxMainLayer(config, name='transformer')
+        self.transformer = TFXxxMainLayer(config, name="transformer")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(config.num_labels,
+        self.classifier = tf.keras.layers.Dense(
-                                                kernel_initializer=get_initializer(config.initializer_range),
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
-                                                name='classifier')
+        )
    def call(self, inputs, **kwargs):
        outputs = self.transformer(inputs, **kwargs)
        sequence_output = outputs[0]
-        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
+        sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
        logits = self.classifier(sequence_output)
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
@@ -452,9 +475,12 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel):
        return outputs  # scores, (hidden_states), (attentions)
-@add_start_docstrings("""Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class TFXxxForQuestionAnswering(TFXxxPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -482,14 +508,15 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel):
        start_scores, end_scores = outputs[:2]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFXxxForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
-        self.transformer = TFXxxMainLayer(config, name='transformer')
+        self.transformer = TFXxxMainLayer(config, name="transformer")
-        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
+        self.qa_outputs = tf.keras.layers.Dense(
-                                                kernel_initializer=get_initializer(config.initializer_range),
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
-                                                name='qa_outputs')
+        )
    def call(self, inputs, **kwargs):
        outputs = self.transformer(inputs, **kwargs)

--- a/templates/adding_a_new_model/modeling_xxx.py
+++ b/templates/adding_a_new_model/modeling_xxx.py
@@ -20,22 +20,17 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
-import json
 import logging
-import math
 import os
-import sys
-import copy
-import itertools
-from io import open
 import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
-from .modeling_utils import PreTrainedModel, prune_linear_layer
 from .configuration_xxx import XxxConfig
 from .file_utils import add_start_docstrings
+from .modeling_utils import PreTrainedModel
 logger = logging.getLogger(__name__)
@@ -44,10 +39,11 @@ logger = logging.getLogger(__name__)
 # for the pretrained weights provided with the models
 ####################################################
 XXX_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-pytorch_model.bin",
+    "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-pytorch_model.bin",
-    'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-pytorch_model.bin",
+    "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-pytorch_model.bin",
 }
 ####################################################
 # This is a conversion method from TF 1.0 to PyTorch
 # More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
@@ -60,8 +56,10 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
        import numpy as np
        import tensorflow as tf
    except ImportError:
-        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+        logger.error(
-            "https://www.tensorflow.org/install/ for installation instructions.")
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
@@ -76,7 +74,7 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
        arrays.append(array)
    for name, array in zip(names, arrays):
-        name = name.split('/')
+        name = name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
        # which are not required for using pretrained model
        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
@@ -84,30 +82,30 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
            continue
        pointer = model
        for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
-                l = re.split(r'_(\d+)', m_name)
+                scope_names = re.split(r"_(\d+)", m_name)
            else:
-                l = [m_name]
+                scope_names = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
+            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
-                pointer = getattr(pointer, 'weight')
+                pointer = getattr(pointer, "weight")
-            elif l[0] == 'output_bias' or l[0] == 'beta':
+            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
-                pointer = getattr(pointer, 'bias')
+                pointer = getattr(pointer, "bias")
-            elif l[0] == 'output_weights':
+            elif scope_names[0] == "output_weights":
-                pointer = getattr(pointer, 'weight')
+                pointer = getattr(pointer, "weight")
-            elif l[0] == 'squad':
+            elif scope_names[0] == "squad":
-                pointer = getattr(pointer, 'classifier')
+                pointer = getattr(pointer, "classifier")
            else:
                try:
-                    pointer = getattr(pointer, l[0])
+                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info("Skipping {}".format("/".join(name)))
                    continue
-            if len(l) >= 2:
+            if len(scope_names) >= 2:
-                num = int(l[1])
+                num = int(scope_names[1])
                pointer = pointer[num]
-        if m_name[-11:] == '_embeddings':
+        if m_name[-11:] == "_embeddings":
-            pointer = getattr(pointer, 'weight')
+            pointer = getattr(pointer, "weight")
-        elif m_name == 'kernel':
+        elif m_name == "kernel":
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
@@ -131,6 +129,14 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
 #
 # See the conversion methods in modeling_tf_pytorch_utils.py for more details
 ####################################################
+XxxAttention = nn.Module
+XxxIntermediate = nn.Module
+XxxOutput = nn.Module
 class XxxLayer(nn.Module):
    def __init__(self, config):
        super(XxxLayer, self).__init__()
@@ -147,7 +153,6 @@ class XxxLayer(nn.Module):
        return outputs
 ####################################################
 # PreTrainedModel is a sub-class of torch.nn.Module
 # which takes care of loading and saving pretrained weights
@@ -157,10 +162,21 @@ class XxxLayer(nn.Module):
 # pointers for your model and the weights initialization
 # method if it's not fully covered by PreTrainedModel's default method
 ####################################################
+XxxLayerNorm = torch.nn.LayerNorm
+XxxEmbeddings = nn.Module
+XxxEncoder = nn.Module
+XxxPooler = nn.Module
 class XxxPreTrainedModel(PreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    config_class = XxxConfig
    pretrained_model_archive_map = XXX_PRETRAINED_MODEL_ARCHIVE_MAP
    load_tf_weights = load_tf_weights_in_xxx
@@ -195,7 +211,7 @@ XXX_START_DOCSTRING = r"""    The XXX model was proposed in
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
-        config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
@@ -209,13 +225,13 @@ XXX_INPUTS_DOCSTRING = r"""
            (a) For sequence pairs:
                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
                ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
            (b) For single sequences:
                ``tokens:         [CLS] the dog is hairy . [SEP]``
                ``token_type_ids:   0   0   0   0  0     0   0``
            Xxx is a model with absolute position embeddings so it's usually advised to pad the inputs on
@@ -246,8 +262,12 @@ XXX_INPUTS_DOCSTRING = r"""
            than the model's internal embedding lookup matrix.
 """
-@add_start_docstrings("The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.",
-                      XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+@add_start_docstrings(
+    "The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.",
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class XxxModel(XxxPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -277,6 +297,7 @@ class XxxModel(XxxPreTrainedModel):
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
    def __init__(self, config):
        super(XxxModel, self).__init__(config)
@@ -300,7 +321,15 @@ class XxxModel(XxxPreTrainedModel):
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+    ):
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
@@ -329,7 +358,7 @@ class XxxModel(XxxPreTrainedModel):
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        # Prepare head mask if needed
@@ -342,14 +371,20 @@ class XxxModel(XxxPreTrainedModel):
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+                head_mask = (
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
+                )  # We can specify head_mask for each layer
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to fload if need + fp16 compatibility
        else:
            head_mask = [None] * self.config.num_hidden_layers
        ##################################
        # Replace this with your model code
-        embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds)
+        embedding_output = self.embeddings(
+            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
+        )
        encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
        sequence_output = encoder_outputs[0]
        outputs = (sequence_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
@@ -357,8 +392,9 @@ class XxxModel(XxxPreTrainedModel):
        return outputs  # sequence_output, (hidden_states), (attentions)
-@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """,
+@add_start_docstrings(
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+    """Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING
+)
 class XxxForMaskedLM(XxxPreTrainedModel):
    r"""
        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -389,6 +425,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
        loss, prediction_scores = outputs[:2]
    """
    def __init__(self, config):
        super(XxxForMaskedLM, self).__init__(config)
@@ -400,15 +437,25 @@ class XxxForMaskedLM(XxxPreTrainedModel):
    def get_output_embeddings(self):
        return self.lm_head
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
+    def forward(
-                masked_lm_labels=None):
+        self,
+        input_ids=None,
-        outputs = self.transformer(input_ids,
+        attention_mask=None,
-                            attention_mask=attention_mask,
+        token_type_ids=None,
-                            token_type_ids=token_type_ids,
+        position_ids=None,
-                            position_ids=position_ids, 
+        head_mask=None,
-                            head_mask=head_mask,
+        inputs_embeds=None,
-                            inputs_embeds=inputs_embeds)
+        masked_lm_labels=None,
+    ):
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)
@@ -422,9 +469,12 @@ class XxxForMaskedLM(XxxPreTrainedModel):
        return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
-@add_start_docstrings("""Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings(
+    """Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class XxxForSequenceClassification(XxxPreTrainedModel):
    r"""
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -456,6 +506,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
        loss, logits = outputs[:2]
    """
    def __init__(self, config):
        super(XxxForSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels
@@ -466,15 +517,25 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
        self.init_weights()
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
+    def forward(
-                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
+        self,
+        input_ids=None,
-        outputs = self.transformer(input_ids,
+        attention_mask=None,
-                            attention_mask=attention_mask,
+        token_type_ids=None,
-                            token_type_ids=token_type_ids,
+        position_ids=None,
-                            position_ids=position_ids, 
+        head_mask=None,
-                            head_mask=head_mask,
+        inputs_embeds=None,
-                            inputs_embeds=inputs_embeds)
+        labels=None,
+    ):
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
        pooled_output = outputs[1]
@@ -496,9 +557,12 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
        return outputs  # (loss), logits, (hidden_states), (attentions)
-@add_start_docstrings("""Xxx Model with a token classification head on top (a linear layer on top of
+@add_start_docstrings(
+    """Xxx Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class XxxForTokenClassification(XxxPreTrainedModel):
    r"""
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -528,6 +592,7 @@ class XxxForTokenClassification(XxxPreTrainedModel):
        loss, scores = outputs[:2]
    """
    def __init__(self, config):
        super(XxxForTokenClassification, self).__init__(config)
        self.num_labels = config.num_labels
@@ -538,15 +603,25 @@ class XxxForTokenClassification(XxxPreTrainedModel):
        self.init_weights()
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
+    def forward(
-                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
+        self,
+        input_ids=None,
-        outputs = self.transformer(input_ids,
+        attention_mask=None,
-                            attention_mask=attention_mask,
+        token_type_ids=None,
-                            token_type_ids=token_type_ids,
+        position_ids=None,
-                            position_ids=position_ids, 
+        head_mask=None,
-                            head_mask=head_mask,
+        inputs_embeds=None,
-                            inputs_embeds=inputs_embeds)
+        labels=None,
+    ):
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
        sequence_output = outputs[0]
@@ -569,9 +644,12 @@ class XxxForTokenClassification(XxxPreTrainedModel):
        return outputs  # (loss), scores, (hidden_states), (attentions)
-@add_start_docstrings("""Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+    XXX_START_DOCSTRING,
+    XXX_INPUTS_DOCSTRING,
+)
 class XxxForQuestionAnswering(XxxPreTrainedModel):
    r"""
        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -605,14 +683,15 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
        input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
        input_ids = tokenizer.encode(input_text)
-        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] 
+        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)  
+        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
        # a nice puppet
    """
    def __init__(self, config):
        super(XxxForQuestionAnswering, self).__init__(config)
        self.num_labels = config.num_labels
@@ -622,15 +701,26 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
        self.init_weights()
-    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
+    def forward(
-                start_positions=None, end_positions=None):
+        self,
+        input_ids=None,
-        outputs = self.transformer(input_ids,
+        attention_mask=None,
-                            attention_mask=attention_mask,
+        token_type_ids=None,
-                            token_type_ids=token_type_ids,
+        position_ids=None,
-                            position_ids=position_ids, 
+        head_mask=None,
-                            head_mask=head_mask,
+        inputs_embeds=None,
-                            inputs_embeds=inputs_embeds)
+        start_positions=None,
+        end_positions=None,
+    ):
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
        sequence_output = outputs[0]

--- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
@@ -12,61 +12,68 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import
+from __future__ import absolute_import, division, print_function
-from __future__ import division
-from __future__ import print_function
 import unittest
-import sys
-from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from transformers import XxxConfig, is_tf_available
 from .configuration_common_test import ConfigTester
+from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
 from .utils import CACHE_DIR, require_tf, slow
-from transformers import XxxConfig, is_tf_available
 if is_tf_available():
-    import tensorflow as tf
+    from transformers.modeling_tf_xxx import (
-    from transformers.modeling_tf_xxx import (TFXxxModel, TFXxxForMaskedLM,
+        TFXxxModel,
-                                               TFXxxForSequenceClassification,
+        TFXxxForMaskedLM,
-                                               TFXxxForTokenClassification,
+        TFXxxForSequenceClassification,
-                                               TFXxxForQuestionAnswering,
+        TFXxxForTokenClassification,
-                                               TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP)
+        TFXxxForQuestionAnswering,
+    )
 @require_tf
 class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
-    all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering,
+    all_model_classes = (
-                         TFXxxForSequenceClassification,
+        (
-                         TFXxxForTokenClassification) if is_tf_available() else ()
+            TFXxxModel,
+            TFXxxForMaskedLM,
+            TFXxxForQuestionAnswering,
+            TFXxxForSequenceClassification,
+            TFXxxForTokenClassification,
+        )
+        if is_tf_available()
+        else ()
+    )
    class TFXxxModelTester(object):
+        def __init__(
-        def __init__(self,
+            self,
-                     parent,
+            parent,
-                     batch_size=13,
+            batch_size=13,
-                     seq_length=7,
+            seq_length=7,
-                     is_training=True,
+            is_training=True,
-                     use_input_mask=True,
+            use_input_mask=True,
-                     use_token_type_ids=True,
+            use_token_type_ids=True,
-                     use_labels=True,
+            use_labels=True,
-                     vocab_size=99,
+            vocab_size=99,
-                     hidden_size=32,
+            hidden_size=32,
-                     num_hidden_layers=5,
+            num_hidden_layers=5,
-                     num_attention_heads=4,
+            num_attention_heads=4,
-                     intermediate_size=37,
+            intermediate_size=37,
-                     hidden_act="gelu",
+            hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
+            hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
+            max_position_embeddings=512,
-                     type_vocab_size=16,
+            type_vocab_size=16,
-                     type_sequence_label_size=2,
+            type_sequence_label_size=2,
-                     initializer_range=0.02,
+            initializer_range=0.02,
-                     num_labels=3,
+            num_labels=3,
-                     num_choices=4,
+            num_choices=4,
-                     scope=None,
+            scope=None,
-                    ):
+        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
@@ -120,15 +127,16 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range)
+                initializer_range=self.initializer_range,
+            )
            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xxx_model(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
            model = TFXxxModel(config=config)
-            inputs = {'input_ids': input_ids,
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
            sequence_output, pooled_output = model(inputs)
            inputs = [input_ids, input_mask]
@@ -141,78 +149,74 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
                "pooled_output": pooled_output.numpy(),
            }
            self.parent.assertListEqual(
-                list(result["sequence_output"].shape),
+                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-                [self.batch_size, self.seq_length, self.hidden_size])
+            )
            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
+        def create_and_check_xxx_for_masked_lm(
-        def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
            model = TFXxxForMaskedLM(config=config)
-            inputs = {'input_ids': input_ids,
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-                      'attention_mask': input_mask,
+            (prediction_scores,) = model(inputs)
-                      'token_type_ids': token_type_ids}
-            prediction_scores, = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
-                list(result["prediction_scores"].shape),
+                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-                [self.batch_size, self.seq_length, self.vocab_size])
+            )
-        def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xxx_for_sequence_classification(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
            config.num_labels = self.num_labels
            model = TFXxxForSequenceClassification(config=config)
-            inputs = {'input_ids': input_ids,
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-                      'attention_mask': input_mask,
+            (logits,) = model(inputs)
-                      'token_type_ids': token_type_ids}
-            logits, = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
-            self.parent.assertListEqual(
+            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
-                list(result["logits"].shape),
-                [self.batch_size, self.num_labels])
+        def create_and_check_xxx_for_token_classification(
-        def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
            config.num_labels = self.num_labels
            model = TFXxxForTokenClassification(config=config)
-            inputs = {'input_ids': input_ids,
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-                      'attention_mask': input_mask,
+            (logits,) = model(inputs)
-                      'token_type_ids': token_type_ids}
-            logits, = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(
-                list(result["logits"].shape),
+                list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
-                [self.batch_size, self.seq_length, self.num_labels])
+            )
-        def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xxx_for_question_answering(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
            model = TFXxxForQuestionAnswering(config=config)
-            inputs = {'input_ids': input_ids,
+            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
            start_logits, end_logits = model(inputs)
            result = {
                "start_logits": start_logits.numpy(),
                "end_logits": end_logits.numpy(),
            }
-            self.parent.assertListEqual(
+            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
-                list(result["start_logits"].shape),
+            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["end_logits"].shape),
-                [self.batch_size, self.seq_length])
        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, input_mask,
+            (
-             sequence_labels, token_labels, choice_labels) = config_and_inputs
+                config,
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+                input_ids,
+                token_type_ids,
+                input_mask,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict
    def setUp(self):
@@ -244,9 +248,10 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
    @slow
    def test_model_from_pretrained(self):
-        for model_name in ['xxx-base-uncased']:
+        for model_name in ["xxx-base-uncased"]:
            model = TFXxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)
 if __name__ == "__main__":
    unittest.main()
--- a/templates/adding_a_new_model/tests/modeling_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py
@@ -12,59 +12,64 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import
+from __future__ import absolute_import, division, print_function
-from __future__ import division
-from __future__ import print_function
 import unittest
 from transformers import is_torch_available
-from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .modeling_common_test import CommonTestCases, ids_tensor
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 if is_torch_available():
-    from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
+    from transformers import (
-                                        XxxForNextSentencePrediction, XxxForPreTraining,
+        XxxConfig,
-                                        XxxForQuestionAnswering, XxxForSequenceClassification,
+        XxxModel,
-                                        XxxForTokenClassification, XxxForMultipleChoice)
+        XxxForMaskedLM,
+        XxxForQuestionAnswering,
+        XxxForSequenceClassification,
+        XxxForTokenClassification,
+    )
    from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
 @require_torch
 class XxxModelTest(CommonTestCases.CommonModelTester):
-    all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
+    all_model_classes = (
-                         XxxForSequenceClassification,
+        (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering, XxxForSequenceClassification, XxxForTokenClassification)
-                         XxxForTokenClassification) if is_torch_available() else ()
+        if is_torch_available()
+        else ()
+    )
    class XxxModelTester(object):
+        def __init__(
-        def __init__(self,
+            self,
-                     parent,
+            parent,
-                     batch_size=13,
+            batch_size=13,
-                     seq_length=7,
+            seq_length=7,
-                     is_training=True,
+            is_training=True,
-                     use_input_mask=True,
+            use_input_mask=True,
-                     use_token_type_ids=True,
+            use_token_type_ids=True,
-                     use_labels=True,
+            use_labels=True,
-                     vocab_size=99,
+            vocab_size=99,
-                     hidden_size=32,
+            hidden_size=32,
-                     num_hidden_layers=5,
+            num_hidden_layers=5,
-                     num_attention_heads=4,
+            num_attention_heads=4,
-                     intermediate_size=37,
+            intermediate_size=37,
-                     hidden_act="gelu",
+            hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
+            hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
+            max_position_embeddings=512,
-                     type_vocab_size=16,
+            type_vocab_size=16,
-                     type_sequence_label_size=2,
+            type_sequence_label_size=2,
-                     initializer_range=0.02,
+            initializer_range=0.02,
-                     num_labels=3,
+            num_labels=3,
-                     num_choices=4,
+            num_choices=4,
-                     scope=None,
+            scope=None,
-                    ):
+        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
@@ -118,16 +123,17 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range)
+                initializer_range=self.initializer_range,
+            )
            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        def check_loss_output(self, result):
-            self.parent.assertListEqual(
+            self.parent.assertListEqual(list(result["loss"].size()), [])
-                list(result["loss"].size()),
-                [])
-        def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xxx_model(
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
            model = XxxModel(config=config)
            model.to(torch_device)
            model.eval()
@@ -140,83 +146,98 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
-                list(result["sequence_output"].size()),
+                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-                [self.batch_size, self.seq_length, self.hidden_size])
+            )
            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+        def create_and_check_xxx_for_masked_lm(
-        def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
            model = XxxForMaskedLM(config=config)
            model.to(torch_device)
            model.eval()
-            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
+            loss, prediction_scores = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
+            )
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()),
+                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-                [self.batch_size, self.seq_length, self.vocab_size])
+            )
            self.check_loss_output(result)
+        def create_and_check_xxx_for_question_answering(
-        def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
            model = XxxForQuestionAnswering(config=config)
            model.to(torch_device)
            model.eval()
-            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
+            loss, start_logits, end_logits = model(
-                                                   start_positions=sequence_labels, end_positions=sequence_labels)
+                input_ids,
+                attention_mask=input_mask,
+                token_type_ids=token_type_ids,
+                start_positions=sequence_labels,
+                end_positions=sequence_labels,
+            )
            result = {
                "loss": loss,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }
-            self.parent.assertListEqual(
+            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
-                list(result["start_logits"].size()),
+            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["end_logits"].size()),
-                [self.batch_size, self.seq_length])
            self.check_loss_output(result)
+        def create_and_check_xxx_for_sequence_classification(
-        def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
            config.num_labels = self.num_labels
            model = XxxForSequenceClassification(config)
            model.to(torch_device)
            model.eval()
-            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
+            loss, logits = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
+            )
            result = {
                "loss": loss,
                "logits": logits,
            }
-            self.parent.assertListEqual(
+            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
-                list(result["logits"].size()),
-                [self.batch_size, self.num_labels])
            self.check_loss_output(result)
+        def create_and_check_xxx_for_token_classification(
-        def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        ):
            config.num_labels = self.num_labels
            model = XxxForTokenClassification(config=config)
            model.to(torch_device)
            model.eval()
-            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+            loss, logits = model(
+                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
+            )
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(
-                list(result["logits"].size()),
+                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
-                [self.batch_size, self.seq_length, self.num_labels])
+            )
            self.check_loss_output(result)
        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, input_mask,
+            (
-             sequence_labels, token_labels, choice_labels) = config_and_inputs
+                config,
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+                input_ids,
+                token_type_ids,
+                input_mask,
+                sequence_labels,
+                token_labels,
+                choice_labels,
+            ) = config_and_inputs
+            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict
    def setUp(self):
@@ -252,5 +273,6 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
            model = XxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)
 if __name__ == "__main__":
    unittest.main()
--- a/templates/adding_a_new_model/tests/tokenization_xxx_test.py
+++ b/templates/adding_a_new_model/tests/tokenization_xxx_test.py
@@ -18,10 +18,11 @@ import os
 import unittest
 from io import open
-from transformers.tokenization_bert import (XxxTokenizer, VOCAB_FILES_NAMES)
+from transformers.tokenization_bert import VOCAB_FILES_NAMES, XxxTokenizer
 from .tokenization_tests_commons import CommonTestCases
 class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester):
    tokenizer_class = XxxTokenizer
@@ -30,28 +31,39 @@ class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester):
        super(XxxTokenizationTest, self).setUp()
        vocab_tokens = [
-            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
+            "[UNK]",
-            "##ing", ",", "low", "lowest",
+            "[CLS]",
+            "[SEP]",
+            "want",
+            "##want",
+            "##ed",
+            "wa",
+            "un",
+            "runn",
+            "##ing",
+            ",",
+            "low",
+            "lowest",
        ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
    def get_tokenizer(self, **kwargs):
        return XxxTokenizer.from_pretrained(self.tmpdirname, **kwargs)
    def get_input_output_texts(self):
-        input_text = u"UNwant\u00E9d,running"
+        input_text = "UNwant\u00E9d,running"
-        output_text = u"unwanted, running"
+        output_text = "unwanted, running"
        return input_text, output_text
    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file)
-        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
+        tokens = tokenizer.tokenize("UNwant\u00E9d,running")
        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
-if __name__ == '__main__':
+if __name__ == "__main__":
    unittest.main()
--- a/templates/adding_a_new_model/tokenization_xxx.py
+++ b/templates/adding_a_new_model/tokenization_xxx.py
@@ -19,11 +19,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import collections
 import logging
 import os
-import unicodedata
 from io import open
 from .tokenization_utils import PreTrainedTokenizer
 logger = logging.getLogger(__name__)
 ####################################################
@@ -34,17 +34,16 @@ logger = logging.getLogger(__name__)
 # Mapping from the keyword arguments names of Tokenizer `__init__`
 # to file names for serializing Tokenizer instances
 ####################################################
-VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
 ####################################################
 # Mapping from the keyword arguments names of Tokenizer `__init__`
 # to pretrained vocabulary URL for all the model shortcut names.
 ####################################################
 PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
+    "vocab_file": {
-    {
+        "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-vocab.txt",
-        'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-vocab.txt",
+        "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-vocab.txt",
-        'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-vocab.txt",
    }
 }
@@ -52,8 +51,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
 # Mapping from model shortcut names to max length of inputs
 ####################################################
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'xxx-base-uncased': 512,
+    "xxx-base-uncased": 512,
-    'xxx-large-uncased': 512,
+    "xxx-large-uncased": 512,
 }
 ####################################################
@@ -62,8 +61,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 # To be used for checkpoint specific configurations.
 ####################################################
 PRETRAINED_INIT_CONFIGURATION = {
-    'xxx-base-uncased': {'do_lower_case': True},
+    "xxx-base-uncased": {"do_lower_case": True},
-    'xxx-large-uncased': {'do_lower_case': True},
+    "xxx-large-uncased": {"do_lower_case": True},
 }
@@ -73,7 +72,7 @@ def load_vocab(vocab_file):
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
-        token = token.rstrip('\n')
+        token = token.rstrip("\n")
        vocab[token] = index
    return vocab
@@ -93,9 +92,17 @@ class XxxTokenizer(PreTrainedTokenizer):
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    def __init__(self, vocab_file, do_lower_case=True,
+    def __init__(
-                 unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
+        self,
-                 mask_token="[MASK]", **kwargs):
+        vocab_file,
+        do_lower_case=True,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        **kwargs
+    ):
        """Constructs a XxxTokenizer.
        Args:
@@ -104,16 +111,22 @@ class XxxTokenizer(PreTrainedTokenizer):
                Whether to lower case the input
                Only has an effect when do_basic_tokenize=True
        """
-        super(XxxTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
+        super(XxxTokenizer, self).__init__(
-                                           pad_token=pad_token, cls_token=cls_token,
+            unk_token=unk_token,
-                                           mask_token=mask_token, **kwargs)
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            **kwargs
+        )
        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
-                "model use `tokenizer = XxxTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+                "model use `tokenizer = XxxTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
+            )
        self.vocab = load_vocab(vocab_file)
    @property
@@ -142,7 +155,7 @@ class XxxTokenizer(PreTrainedTokenizer):
    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
-        out_string = ' '.join(tokens).replace(' ##', '').strip()
+        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
@@ -177,8 +190,10 @@ class XxxTokenizer(PreTrainedTokenizer):
        if already_has_special_tokens:
            if token_ids_1 is not None:
-                raise ValueError("You should not supply a second sequence if the provided sequence of "
+                raise ValueError(
-                                 "ids is already formated with special tokens for the model.")
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formated with special tokens for the model."
+                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
        if token_ids_1 is not None:
@@ -204,15 +219,17 @@ class XxxTokenizer(PreTrainedTokenizer):
        """Save the tokenizer vocabulary to a directory or file."""
        index = 0
        if os.path.isdir(vocab_path):
-            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
+            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
        else:
            vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
-                    logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
+                    logger.warning(
-                                   " Please check that the vocabulary is not corrupted!".format(vocab_file))
+                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!".format(vocab_file)
+                    )
                    index = token_index
-                writer.write(token + u'\n')
+                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
 __version__ = "2.3.0"
 # Work around to update TensorFlow's absl.logging threshold which alters the
@@ -6,212 +10,377 @@ __version__ = "2.3.0"
 # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
 try:
    import absl.logging
-    absl.logging.set_verbosity('info')
+except ImportError:
-    absl.logging.set_stderrthreshold('info')
-    absl.logging._warn_preinit_stderr = False
-except:
    pass
+else:
+    absl.logging.set_verbosity("info")
+    absl.logging.set_stderrthreshold("info")
+    absl.logging._warn_preinit_stderr = False
 import logging
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig
+from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
+from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig
+from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig
+from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
+from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig
+from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
+from .configuration_mmbt import MMBTConfig
+from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
+from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
+from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
+from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
-# Files and general utilities
+# Configurations
-from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
+from .configuration_utils import PretrainedConfig
-                         cached_path, add_start_docstrings, add_end_docstrings,
+from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig
-                         WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, MODEL_CARD_NAME,
+from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig
-                         is_tf_available, is_torch_available)
+from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig
+from .data import (
-from .data import (is_sklearn_available,
+    DataProcessor,
-                   InputExample, InputFeatures, DataProcessor,
+    InputExample,
-                   SingleSentenceClassificationProcessor,
+    InputFeatures,
-                   glue_output_modes, glue_convert_examples_to_features,
+    SingleSentenceClassificationProcessor,
-                   glue_processors, glue_tasks_num_labels,
+    SquadExample,
-                   xnli_output_modes, xnli_processors, xnli_tasks_num_labels,
+    SquadFeatures,
-                   squad_convert_examples_to_features, SquadFeatures, 
+    SquadV1Processor,
-                   SquadExample, SquadV1Processor, SquadV2Processor)
+    SquadV2Processor,
+    glue_convert_examples_to_features,
+    glue_output_modes,
+    glue_processors,
+    glue_tasks_num_labels,
+    is_sklearn_available,
+    squad_convert_examples_to_features,
+    xnli_output_modes,
+    xnli_processors,
+    xnli_tasks_num_labels,
+)
-if is_sklearn_available():
+# Files and general utilities
-    from .data import glue_compute_metrics, xnli_compute_metrics
+from .file_utils import (
+    CONFIG_NAME,
+    MODEL_CARD_NAME,
+    PYTORCH_PRETRAINED_BERT_CACHE,
+    PYTORCH_TRANSFORMERS_CACHE,
+    TF2_WEIGHTS_NAME,
+    TF_WEIGHTS_NAME,
+    TRANSFORMERS_CACHE,
+    WEIGHTS_NAME,
+    add_end_docstrings,
+    add_start_docstrings,
+    cached_path,
+    is_tf_available,
+    is_torch_available,
+)
 # Model Cards
 from .modelcard import ModelCard
-# Tokenizers
+# TF 2.0 <=> PyTorch conversion utilities
-from .tokenization_utils import (PreTrainedTokenizer)
+from .modeling_tf_pytorch_utils import (
+    convert_tf_weight_name_to_pt_weight_name,
+    load_pytorch_checkpoint_in_tf2_model,
+    load_pytorch_model_in_tf2_model,
+    load_pytorch_weights_in_tf2_model,
+    load_tf2_checkpoint_in_pytorch_model,
+    load_tf2_model_in_pytorch_model,
+    load_tf2_weights_in_pytorch_model,
+)
+# Pipelines
+from .pipelines import (
+    CsvPipelineDataFormat,
+    FeatureExtractionPipeline,
+    JsonPipelineDataFormat,
+    NerPipeline,
+    PipedPipelineDataFormat,
+    Pipeline,
+    PipelineDataFormat,
+    QuestionAnsweringPipeline,
+    TextClassificationPipeline,
+    pipeline,
+)
+from .tokenization_albert import AlbertTokenizer
 from .tokenization_auto import AutoTokenizer
-from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
+from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer
-from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer
+from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer
-from .tokenization_openai import OpenAIGPTTokenizer
+from .tokenization_camembert import CamembertTokenizer
-from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
-from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_ctrl import CTRLTokenizer
-from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
-from .tokenization_xlm import XLMTokenizer
-from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
-from .tokenization_albert import AlbertTokenizer
+from .tokenization_gpt2 import GPT2Tokenizer
-from .tokenization_camembert import CamembertTokenizer
+from .tokenization_openai import OpenAIGPTTokenizer
+from .tokenization_roberta import RobertaTokenizer
 from .tokenization_t5 import T5Tokenizer
+from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer
+# Tokenizers
+from .tokenization_utils import PreTrainedTokenizer
+from .tokenization_xlm import XLMTokenizer
 from .tokenization_xlm_roberta import XLMRobertaTokenizer
+from .tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+if is_sklearn_available():
+    from .data import glue_compute_metrics, xnli_compute_metrics
-# Configurations
-from .configuration_utils import PretrainedConfig
-from .configuration_auto import AutoConfig, ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_mmbt import MMBTConfig
 # Modeling
 if is_torch_available():
-    from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
+    from .modeling_utils import PreTrainedModel, prune_layer, Conv1D
-    from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
+    from .modeling_auto import (
-                                AutoModelWithLMHead, AutoModelForTokenClassification, ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
+        AutoModel,
+        AutoModelForSequenceClassification,
-    from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
+        AutoModelForQuestionAnswering,
-                                BertForMaskedLM, BertForNextSentencePrediction,
+        AutoModelWithLMHead,
-                                BertForSequenceClassification, BertForMultipleChoice,
+        AutoModelForTokenClassification,
-                                BertForTokenClassification, BertForQuestionAnswering,
+        ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    )
-    from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
-                                  OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
+    from .modeling_bert import (
-                                  load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
+        BertPreTrainedModel,
-    from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
+        BertModel,
-                                    AdaptiveEmbedding,
+        BertForPreTraining,
-                                    load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
+        BertForMaskedLM,
-    from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
+        BertForNextSentencePrediction,
-                                GPT2LMHeadModel, GPT2DoubleHeadsModel,
+        BertForSequenceClassification,
-                                load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
+        BertForMultipleChoice,
-    from .modeling_ctrl import (CTRLPreTrainedModel, CTRLModel,
+        BertForTokenClassification,
-                                CTRLLMHeadModel,
+        BertForQuestionAnswering,
-                                CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
+        load_tf_weights_in_bert,
-    from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
+        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                XLNetForSequenceClassification, XLNetForTokenClassification,
+    )
-                                XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple,
+    from .modeling_openai import (
-                                XLNetForQuestionAnswering, load_tf_weights_in_xlnet,
+        OpenAIGPTPreTrainedModel,
-                                XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
+        OpenAIGPTModel,
-    from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
+        OpenAIGPTLMHeadModel,
-                            XLMWithLMHeadModel, XLMForSequenceClassification,
+        OpenAIGPTDoubleHeadsModel,
-                            XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
+        load_tf_weights_in_openai_gpt,
-                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
+        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    from .modeling_roberta import (RobertaForMaskedLM, RobertaModel,
+    )
-                                RobertaForSequenceClassification, RobertaForMultipleChoice,
+    from .modeling_transfo_xl import (
-                                RobertaForTokenClassification, RobertaForQuestionAnswering,
+        TransfoXLPreTrainedModel,
-                                ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
+        TransfoXLModel,
-    from .modeling_distilbert import (DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel,
+        TransfoXLLMHeadModel,
-                                DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
+        AdaptiveEmbedding,
-                                DistilBertForTokenClassification,
+        load_tf_weights_in_transfo_xl,
-                                DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
-    from .modeling_camembert import (CamembertForMaskedLM, CamembertModel,
+    )
-                                CamembertForSequenceClassification, CamembertForMultipleChoice,
+    from .modeling_gpt2 import (
-                                CamembertForTokenClassification,
+        GPT2PreTrainedModel,
-                                CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+        GPT2Model,
+        GPT2LMHeadModel,
+        GPT2DoubleHeadsModel,
+        load_tf_weights_in_gpt2,
+        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_ctrl import CTRLPreTrainedModel, CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
+    from .modeling_xlnet import (
+        XLNetPreTrainedModel,
+        XLNetModel,
+        XLNetLMHeadModel,
+        XLNetForSequenceClassification,
+        XLNetForTokenClassification,
+        XLNetForMultipleChoice,
+        XLNetForQuestionAnsweringSimple,
+        XLNetForQuestionAnswering,
+        load_tf_weights_in_xlnet,
+        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_xlm import (
+        XLMPreTrainedModel,
+        XLMModel,
+        XLMWithLMHeadModel,
+        XLMForSequenceClassification,
+        XLMForQuestionAnswering,
+        XLMForQuestionAnsweringSimple,
+        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_roberta import (
+        RobertaForMaskedLM,
+        RobertaModel,
+        RobertaForSequenceClassification,
+        RobertaForMultipleChoice,
+        RobertaForTokenClassification,
+        RobertaForQuestionAnswering,
+        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_distilbert import (
+        DistilBertPreTrainedModel,
+        DistilBertForMaskedLM,
+        DistilBertModel,
+        DistilBertForSequenceClassification,
+        DistilBertForQuestionAnswering,
+        DistilBertForTokenClassification,
+        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_camembert import (
+        CamembertForMaskedLM,
+        CamembertModel,
+        CamembertForSequenceClassification,
+        CamembertForMultipleChoice,
+        CamembertForTokenClassification,
+        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
    from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
-    from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel,
+    from .modeling_t5 import (
-                              load_tf_weights_in_t5,
+        T5PreTrainedModel,
-                              T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+        T5Model,
-    from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification,
+        T5WithLMHeadModel,
-                                AlbertForQuestionAnswering,
+        load_tf_weights_in_t5,
-                                load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
-    from .modeling_xlm_roberta import (XLMRobertaForMaskedLM, XLMRobertaModel, XLMRobertaForMultipleChoice,
+    )
-                                       XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification)
+    from .modeling_albert import (
+        AlbertPreTrainedModel,
+        AlbertModel,
+        AlbertForMaskedLM,
+        AlbertForSequenceClassification,
+        AlbertForQuestionAnswering,
+        load_tf_weights_in_albert,
+        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_xlm_roberta import (
+        XLMRobertaForMaskedLM,
+        XLMRobertaModel,
+        XLMRobertaForMultipleChoice,
+        XLMRobertaForSequenceClassification,
+        XLMRobertaForTokenClassification,
+    )
    from .modeling_mmbt import ModalEmbeddings, MMBTModel, MMBTForClassification
    # Optimization
-    from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
+    from .optimization import (
-                               get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)
+        AdamW,
+        get_constant_schedule,
+        get_constant_schedule_with_warmup,
+        get_cosine_schedule_with_warmup,
+        get_cosine_with_hard_restarts_schedule_with_warmup,
+        get_linear_schedule_with_warmup,
+    )
 # TensorFlow
 if is_tf_available():
    from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
-    from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
+    from .modeling_tf_auto import (
-                                   TFAutoModelWithLMHead, TFAutoModelForTokenClassification, TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
+        TFAutoModel,
+        TFAutoModelForSequenceClassification,
-    from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings,
+        TFAutoModelForQuestionAnswering,
-                                   TFBertModel, TFBertForPreTraining,
+        TFAutoModelWithLMHead,
-                                   TFBertForMaskedLM, TFBertForNextSentencePrediction,
+        TFAutoModelForTokenClassification,
-                                   TFBertForSequenceClassification, TFBertForMultipleChoice,
+        TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                   TFBertForTokenClassification, TFBertForQuestionAnswering,
+    )
-                                   TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_tf_gpt2 import (TFGPT2PreTrainedModel, TFGPT2MainLayer,
-                                   TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel,
-                                   TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_tf_openai import (TFOpenAIGPTPreTrainedModel, TFOpenAIGPTMainLayer,
-                                     TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel,
-                                     TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_tf_transfo_xl import (TFTransfoXLPreTrainedModel, TFTransfoXLMainLayer,
-                                         TFTransfoXLModel, TFTransfoXLLMHeadModel,
-                                         TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
-                                    TFXLNetModel, TFXLNetLMHeadModel,
-                                    TFXLNetForSequenceClassification,
-                                    TFXLNetForTokenClassification,
-                                    TFXLNetForQuestionAnsweringSimple,
-                                    TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_tf_xlm import (TFXLMPreTrainedModel, TFXLMMainLayer,
-                                  TFXLMModel, TFXLMWithLMHeadModel,
-                                  TFXLMForSequenceClassification,
-                                  TFXLMForQuestionAnsweringSimple,
-                                  TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_tf_roberta import (TFRobertaPreTrainedModel, TFRobertaMainLayer,
-                                      TFRobertaModel, TFRobertaForMaskedLM,
-                                      TFRobertaForSequenceClassification,
-                                      TFRobertaForTokenClassification,
-                                      TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
-                                         TFDistilBertModel, TFDistilBertForMaskedLM,
-                                         TFDistilBertForSequenceClassification,
-                                         TFDistilBertForTokenClassification,
-                                         TFDistilBertForQuestionAnswering,
-                                         TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_tf_ctrl import (TFCTRLPreTrainedModel, TFCTRLModel,
-                                    TFCTRLLMHeadModel,
-                                    TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
-                                     TFAlbertForSequenceClassification,
-                                    TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_tf_t5 import (TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel,
-                                 TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
-    # Optimization
+    from .modeling_tf_bert import (
-    from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
+        TFBertPreTrainedModel,
+        TFBertMainLayer,
+        TFBertEmbeddings,
+        TFBertModel,
+        TFBertForPreTraining,
+        TFBertForMaskedLM,
+        TFBertForNextSentencePrediction,
+        TFBertForSequenceClassification,
+        TFBertForMultipleChoice,
+        TFBertForTokenClassification,
+        TFBertForQuestionAnswering,
+        TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
-# TF 2.0 <=> PyTorch conversion utilities
+    from .modeling_tf_gpt2 import (
-from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
+        TFGPT2PreTrainedModel,
-                                        load_pytorch_checkpoint_in_tf2_model,
+        TFGPT2MainLayer,
-                                        load_pytorch_weights_in_tf2_model,
+        TFGPT2Model,
-                                        load_pytorch_model_in_tf2_model,
+        TFGPT2LMHeadModel,
-                                        load_tf2_checkpoint_in_pytorch_model,
+        TFGPT2DoubleHeadsModel,
-                                        load_tf2_weights_in_pytorch_model,
+        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                        load_tf2_model_in_pytorch_model)
+    )
+    from .modeling_tf_openai import (
+        TFOpenAIGPTPreTrainedModel,
+        TFOpenAIGPTMainLayer,
+        TFOpenAIGPTModel,
+        TFOpenAIGPTLMHeadModel,
+        TFOpenAIGPTDoubleHeadsModel,
+        TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_tf_transfo_xl import (
+        TFTransfoXLPreTrainedModel,
+        TFTransfoXLMainLayer,
+        TFTransfoXLModel,
+        TFTransfoXLLMHeadModel,
+        TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_tf_xlnet import (
+        TFXLNetPreTrainedModel,
+        TFXLNetMainLayer,
+        TFXLNetModel,
+        TFXLNetLMHeadModel,
+        TFXLNetForSequenceClassification,
+        TFXLNetForTokenClassification,
+        TFXLNetForQuestionAnsweringSimple,
+        TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_tf_xlm import (
+        TFXLMPreTrainedModel,
+        TFXLMMainLayer,
+        TFXLMModel,
+        TFXLMWithLMHeadModel,
+        TFXLMForSequenceClassification,
+        TFXLMForQuestionAnsweringSimple,
+        TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_tf_roberta import (
+        TFRobertaPreTrainedModel,
+        TFRobertaMainLayer,
+        TFRobertaModel,
+        TFRobertaForMaskedLM,
+        TFRobertaForSequenceClassification,
+        TFRobertaForTokenClassification,
+        TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_tf_distilbert import (
+        TFDistilBertPreTrainedModel,
+        TFDistilBertMainLayer,
+        TFDistilBertModel,
+        TFDistilBertForMaskedLM,
+        TFDistilBertForSequenceClassification,
+        TFDistilBertForTokenClassification,
+        TFDistilBertForQuestionAnswering,
+        TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_tf_ctrl import (
+        TFCTRLPreTrainedModel,
+        TFCTRLModel,
+        TFCTRLLMHeadModel,
+        TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_tf_albert import (
+        TFAlbertPreTrainedModel,
+        TFAlbertModel,
+        TFAlbertForMaskedLM,
+        TFAlbertForSequenceClassification,
+        TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+    from .modeling_tf_t5 import TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
+    # Optimization
+    from .optimization_tf import WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator
-# Pipelines
-from .pipelines import pipeline, PipelineDataFormat, CsvPipelineDataFormat, JsonPipelineDataFormat, PipedPipelineDataFormat, \
-    Pipeline, FeatureExtractionPipeline, QuestionAnsweringPipeline, NerPipeline, TextClassificationPipeline
 if not is_tf_available() and not is_torch_available():
-    logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found."
+    logger.warning(
-                   "Models won't be available and only tokenizers, configuration"
+        "Neither PyTorch nor TensorFlow >= 2.0 have been found."
-                   "and file/data utilities can be used.")
+        "Models won't be available and only tokenizers, configuration"
+        "and file/data utilities can be used."
+    )
--- a/transformers/__main__.py
+++ b/transformers/__main__.py
 # coding: utf8
 def main():
    import sys
    if len(sys.argv) < 2 or sys.argv[1] not in ["convert", "train", "predict", "serve"]:
        print(
-        "First argument to `transformers` command line interface should be one of: \n"
+            "First argument to `transformers` command line interface should be one of: \n"
-        ">> convert serve train predict")
+            ">> convert serve train predict"
+        )
    if sys.argv[1] == "convert":
        from transformers.commands import convert
        convert(sys.argv)
    elif sys.argv[1] == "train":
        from transformers.commands import train
        train(sys.argv)
    elif sys.argv[1] == "serve":
        pass
@@ -19,7 +24,6 @@ def main():
        # parser = ArgumentParser('Transformers CLI tool', usage='transformers serve <command> [<args>]')
        # commands_parser = parser.add_subparsers(help='transformers-cli command helpers')
        # # Register commands
        # ServeCommand.register_subcommand(commands_parser)
@@ -33,5 +37,6 @@ def main():
        # service = args.func(args)
        # service.run()
-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
--- a/transformers/commands/__init__.py
+++ b/transformers/commands/__init__.py
 from abc import ABC, abstractmethod
 from argparse import ArgumentParser
 class BaseTransformersCLICommand(ABC):
    @staticmethod
    @abstractmethod

--- a/transformers/commands/convert.py
+++ b/transformers/commands/convert.py
 from argparse import ArgumentParser, Namespace
 from logging import getLogger
-from transformers import AutoModel, AutoTokenizer
 from transformers.commands import BaseTransformersCLICommand
@@ -11,12 +9,12 @@ def convert_command_factory(args: Namespace):
    Factory function used to convert a model TF 1.0 checkpoint into a PyTorch checkpoint.
    :return: ConvertCommand
    """
-    return ConvertCommand(args.model_type, args.tf_checkpoint, args.pytorch_dump_output,
+    return ConvertCommand(
-                          args.config, args.finetuning_task_name)
+        args.model_type, args.tf_checkpoint, args.pytorch_dump_output, args.config, args.finetuning_task_name
+    )
 class ConvertCommand(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """
@@ -24,25 +22,39 @@ class ConvertCommand(BaseTransformersCLICommand):
        :param parser: Root parser to register command-specific arguments
        :return:
        """
-        train_parser = parser.add_parser('convert', help="CLI tool to run convert model from original "
+        train_parser = parser.add_parser(
-                                                         "author checkpoints to Transformesr PyTorch checkpoints.")
+            "convert",
-        train_parser.add_argument('--model_type', type=str, required=True,
+            help="CLI tool to run convert model from original "
-                                  help='Model\'s type.')
+            "author checkpoints to Transformesr PyTorch checkpoints.",
-        train_parser.add_argument('--tf_checkpoint', type=str, required=True,
+        )
-                                  help='TensorFlow checkpoint path or folder.')
+        train_parser.add_argument("--model_type", type=str, required=True, help="Model's type.")
-        train_parser.add_argument('--pytorch_dump_output', type=str, required=True,
+        train_parser.add_argument(
-                                  help='Path to the PyTorch savd model output.')
+            "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder."
-        train_parser.add_argument('--config', type=str, default="",
+        )
-                                  help='Configuration file path or folder.')
+        train_parser.add_argument(
-        train_parser.add_argument('--finetuning_task_name', type=str, default=None,
+            "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch savd model output."
-                                  help='Optional fine-tuning task name if the TF model was a finetuned model.')
+        )
+        train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.")
+        train_parser.add_argument(
+            "--finetuning_task_name",
+            type=str,
+            default=None,
+            help="Optional fine-tuning task name if the TF model was a finetuned model.",
+        )
        train_parser.set_defaults(func=convert_command_factory)
-    def __init__(self, model_type: str, tf_checkpoint: str, pytorch_dump_output: str,
+    def __init__(
-                 config: str, finetuning_task_name: str, *args):
+        self,
-        self._logger = getLogger('transformers-cli/converting')
+        model_type: str,
+        tf_checkpoint: str,
+        pytorch_dump_output: str,
+        config: str,
+        finetuning_task_name: str,
+        *args
+    ):
+        self._logger = getLogger("transformers-cli/converting")
-        self._logger.info('Loading model {}'.format(model_type))
+        self._logger.info("Loading model {}".format(model_type))
        self._model_type = model_type
        self._tf_checkpoint = tf_checkpoint
        self._pytorch_dump_output = pytorch_dump_output
@@ -52,63 +64,80 @@ class ConvertCommand(BaseTransformersCLICommand):
    def run(self):
        if self._model_type == "bert":
            try:
-                from transformers.convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
+                from transformers.convert_bert_original_tf_checkpoint_to_pytorch import (
+                    convert_tf_checkpoint_to_pytorch,
+                )
            except ImportError:
-                msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \
+                msg = (
-                    "In that case, it requires TensorFlow to be installed. Please see " \
+                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions."
+                )
                raise ImportError(msg)
            convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
        elif self._model_type == "gpt":
-            from transformers.convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
+            from transformers.convert_openai_original_tf_checkpoint_to_pytorch import (
-            convert_openai_checkpoint_to_pytorch(self._tf_checkpoint,
+                convert_openai_checkpoint_to_pytorch,
-                                                    self._config,
+            )
-                                                    self._pytorch_dump_output)
+            convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
        elif self._model_type == "transfo_xl":
            try:
-                from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
+                from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import (
+                    convert_transfo_xl_checkpoint_to_pytorch,
+                )
            except ImportError:
-                msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \
+                msg = (
-                    "In that case, it requires TensorFlow to be installed. Please see " \
+                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions."
+                )
                raise ImportError(msg)
-            if 'ckpt' in self._tf_checkpoint.lower():
+            if "ckpt" in self._tf_checkpoint.lower():
                TF_CHECKPOINT = self._tf_checkpoint
                TF_DATASET_FILE = ""
            else:
                TF_DATASET_FILE = self._tf_checkpoint
                TF_CHECKPOINT = ""
-            convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT,
+            convert_transfo_xl_checkpoint_to_pytorch(
-                                                        self._config,
+                TF_CHECKPOINT, self._config, self._pytorch_dump_output, TF_DATASET_FILE
-                                                        self._pytorch_dump_output,
+            )
-                                                        TF_DATASET_FILE)
        elif self._model_type == "gpt2":
            try:
-                from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
+                from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import (
+                    convert_gpt2_checkpoint_to_pytorch,
+                )
            except ImportError:
-                msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \
+                msg = (
-                    "In that case, it requires TensorFlow to be installed. Please see " \
+                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions."
+                )
                raise ImportError(msg)
            convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
        elif self._model_type == "xlnet":
            try:
-                from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
+                from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import (
+                    convert_xlnet_checkpoint_to_pytorch,
+                )
            except ImportError:
-                msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \
+                msg = (
-                    "In that case, it requires TensorFlow to be installed. Please see " \
+                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions."
+                )
                raise ImportError(msg)
-            convert_xlnet_checkpoint_to_pytorch(self._tf_checkpoint,
+            convert_xlnet_checkpoint_to_pytorch(
-                                                self._config,
+                self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name
-                                                self._pytorch_dump_output,
+            )
-                                                self._finetuning_task_name)
        elif self._model_type == "xlm":
-            from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
+            from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import (
+                convert_xlm_checkpoint_to_pytorch,
+            )
            convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
        else:

--- a/transformers/commands/download.py
+++ b/transformers/commands/download.py
@@ -8,13 +8,16 @@ def download_command_factory(args):
 class DownloadCommand(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
-        download_parser = parser.add_parser('download')
+        download_parser = parser.add_parser("download")
-        download_parser.add_argument('--cache-dir', type=str, default=None, help='Path to location to store the models')
+        download_parser.add_argument(
-        download_parser.add_argument('--force',  action='store_true', help='Force the model to be download even if already in cache-dir')
+            "--cache-dir", type=str, default=None, help="Path to location to store the models"
-        download_parser.add_argument('model', type=str, help='Name of the model to download')
+        )
+        download_parser.add_argument(
+            "--force", action="store_true", help="Force the model to be download even if already in cache-dir"
+        )
+        download_parser.add_argument("model", type=str, help="Name of the model to download")
        download_parser.set_defaults(func=download_command_factory)
    def __init__(self, model: str, cache: str, force: bool):
@@ -26,4 +29,4 @@ class DownloadCommand(BaseTransformersCLICommand):
        from transformers import AutoModel, AutoTokenizer
        AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
        AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
\ No newline at end of file
--- a/transformers/commands/run.py
+++ b/transformers/commands/run.py
@@ -2,7 +2,7 @@ import logging
 from argparse import ArgumentParser
 from transformers.commands import BaseTransformersCLICommand
-from transformers.pipelines import pipeline, Pipeline, PipelineDataFormat, SUPPORTED_TASKS
+from transformers.pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
@@ -10,52 +10,72 @@ logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 def try_infer_format_from_ext(path: str):
    if not path:
-        return 'pipe'
+        return "pipe"
    for ext in PipelineDataFormat.SUPPORTED_FORMATS:
        if path.endswith(ext):
            return ext
    raise Exception(
-        'Unable to determine file format from file extension {}. '
+        "Unable to determine file format from file extension {}. "
-        'Please provide the format through --format {}'.format(path, PipelineDataFormat.SUPPORTED_FORMATS)
+        "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS)
    )
 def run_command_factory(args):
-    nlp = pipeline(task=args.task,
+    nlp = pipeline(
-                   model=args.model if args.model else None,
+        task=args.task,
-                   config=args.config,
+        model=args.model if args.model else None,
-                   tokenizer=args.tokenizer,
+        config=args.config,
-                   device=args.device)
+        tokenizer=args.tokenizer,
-    format = try_infer_format_from_ext(args.input) if args.format == 'infer' else args.format
+        device=args.device,
-    reader = PipelineDataFormat.from_str(format=format,
+    )
-                                         output_path=args.output,
+    format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format
-                                         input_path=args.input,
+    reader = PipelineDataFormat.from_str(
-                                         column=args.column if args.column else nlp.default_input_names,
+        format=format,
-                                         overwrite=args.overwrite)
+        output_path=args.output,
+        input_path=args.input,
+        column=args.column if args.column else nlp.default_input_names,
+        overwrite=args.overwrite,
+    )
    return RunCommand(nlp, reader)
 class RunCommand(BaseTransformersCLICommand):
    def __init__(self, nlp: Pipeline, reader: PipelineDataFormat):
        self._nlp = nlp
        self._reader = reader
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
-        run_parser = parser.add_parser('run', help="Run a pipeline through the CLI")
+        run_parser = parser.add_parser("run", help="Run a pipeline through the CLI")
-        run_parser.add_argument('--task', choices=SUPPORTED_TASKS.keys(), help='Task to run')
+        run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run")
-        run_parser.add_argument('--input', type=str, help='Path to the file to use for inference')
+        run_parser.add_argument("--input", type=str, help="Path to the file to use for inference")
-        run_parser.add_argument('--output', type=str, help='Path to the file that will be used post to write results.')
+        run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.")
-        run_parser.add_argument('--model', type=str, help='Name or path to the model to instantiate.')
+        run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.")
-        run_parser.add_argument('--config', type=str, help='Name or path to the model\'s config to instantiate.')
+        run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.")
-        run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. (default: same as the model name)')
+        run_parser.add_argument(
-        run_parser.add_argument('--column', type=str, help='Name of the column to use as input. (For multi columns input as QA use column1,columns2)')
+            "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)"
-        run_parser.add_argument('--format', type=str, default='infer', choices=PipelineDataFormat.SUPPORTED_FORMATS, help='Input format to read from')
+        )
-        run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)')
+        run_parser.add_argument(
-        run_parser.add_argument('--overwrite', action='store_true', help='Allow overwriting the output file.')
+            "--column",
+            type=str,
+            help="Name of the column to use as input. (For multi columns input as QA use column1,columns2)",
+        )
+        run_parser.add_argument(
+            "--format",
+            type=str,
+            default="infer",
+            choices=PipelineDataFormat.SUPPORTED_FORMATS,
+            help="Input format to read from",
+        )
+        run_parser.add_argument(
+            "--device",
+            type=int,
+            default=-1,
+            help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)",
+        )
+        run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.")
        run_parser.set_defaults(func=run_command_factory)
    def run(self):
@@ -71,9 +91,6 @@ class RunCommand(BaseTransformersCLICommand):
        # Saving data
        if self._nlp.binary_output:
            binary_path = self._reader.save_binary(outputs)
-            logger.warning('Current pipeline requires output to be in binary format, saving at {}'.format(binary_path))
+            logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path))
        else:
            self._reader.save(outputs)