Merge branch 'master' into master

d3f24dfa · Lysandre Debut · GitHub · 4b543c30 · ecc4f1bd · d3f24dfa
Unverified Commit d3f24dfa authored Oct 03, 2019 by Lysandre Debut Committed by GitHub Oct 03, 2019
20 changed files
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Convert pytorch checkpoints to TensorFlow """
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import argparse
+import tensorflow as tf
+from transformers import is_torch_available, cached_path
+from transformers import (BertConfig, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, load_bert_pt_weights_in_tf2, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, load_openai_gpt_pt_weights_in_tf2, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, load_roberta_pt_weights_in_tf2, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, load_distilbert_pt_weights_in_tf2, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
+if is_torch_available():
+    import torch
+    import numpy as np
+    from transformers import (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                      GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                      XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                      XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                      TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                      OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                      RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                      DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+else:
+    (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+    XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+    XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+    TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+    OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+    DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,) = (
+        None, None, None, None,
+        None, None,
+        None, None,
+        None, None,
+        None, None,
+        None, None,
+        None, None, None,
+        None, None, None,)
+import logging
+logging.basicConfig(level=logging.INFO)
+MODEL_CLASSES = {
+    'bert': (BertConfig, TFBertForPreTraining, load_bert_pt_weights_in_tf2, BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'bert-large-uncased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, load_bert_pt_weights_in_tf2, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'bert-large-cased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, load_bert_pt_weights_in_tf2, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'bert-base-cased-finetuned-mrpc': (BertConfig, TFBertForSequenceClassification, load_bert_pt_weights_in_tf2, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'gpt2': (GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'xlnet': (XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'xlm': (XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'transfo-xl': (TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'openai-gpt': (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, load_openai_gpt_pt_weights_in_tf2, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'roberta': (RobertaConfig, TFRobertaForMaskedLM, load_roberta_pt_weights_in_tf2, RobertaForMaskedLM, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, load_roberta_pt_weights_in_tf2, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, load_distilbert_pt_weights_in_tf2, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, load_distilbert_pt_weights_in_tf2, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
+}
+def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
+    if model_type not in MODEL_CLASSES:
+        raise ValueError("Unrecognized model type, should be one of {}.".format(list(MODEL_CLASSES.keys())))
+    config_class, model_class, loading_fct, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]
+    # Initialise TF model
+    if config_file in aws_config_map:
+        config_file = cached_path(aws_config_map[config_file], force_download=not use_cached_models)
+    config = config_class.from_json_file(config_file)
+    config.output_hidden_states = True
+    config.output_attentions = True
+    print("Building TensorFlow model from configuration: {}".format(str(config)))
+    tf_model = model_class(config)
+    # Load weights from tf checkpoint
+    if pytorch_checkpoint_path in aws_model_maps:
+        pytorch_checkpoint_path = cached_path(aws_model_maps[pytorch_checkpoint_path], force_download=not use_cached_models)
+    tf_model = loading_fct(tf_model, pytorch_checkpoint_path)
+    if compare_with_pt_model:
+        inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
+        tf_inputs = tf.constant(inputs_list)
+        tfo = tf_model(tf_inputs, training=False)  # build the network
+        pt_model = pt_model_class.from_pretrained(None,
+                                                  config=config,
+                                                  state_dict=torch.load(pytorch_checkpoint_path,
+                                                                        map_location='cpu'))
+        pt_inputs = torch.tensor(inputs_list)
+        with torch.no_grad():
+            pto = pt_model(pt_inputs)
+        np_pt = pto[0].detach().numpy()
+        np_tf = tfo[0].numpy()
+        diff = np.amax(np.abs(np_pt - np_tf))
+        print("Max absolute difference between models outputs {}".format(diff))
+        assert diff <= 2e-2, "Error, model absolute difference is >2e-2"
+    # Save pytorch-model
+    print("Save TensorFlow model to {}".format(tf_dump_path))
+    tf_model.save_weights(tf_dump_path, save_format='h5')
+def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
+                                     compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False):
+    assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory"
+    if args_model_type is None:
+        model_types = list(MODEL_CLASSES.keys())
+    else:
+        model_types = [args_model_type]
+    for j, model_type in enumerate(model_types, start=1):
+        print("=" * 100)
+        print(" Converting model type {}/{}: {}".format(j, len(model_types), model_type))
+        print("=" * 100)
+        if model_type not in MODEL_CLASSES:
+            raise ValueError("Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys())))
+        config_class, model_class, loading_fct, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]
+        if model_shortcut_names_or_path is None:
+            model_shortcut_names_or_path = list(aws_model_maps.keys())
+        if config_shortcut_names_or_path is None:
+            config_shortcut_names_or_path = model_shortcut_names_or_path
+        for i, (model_shortcut_name, config_shortcut_name) in enumerate(
+                zip(model_shortcut_names_or_path, config_shortcut_names_or_path), start=1):
+            print("-" * 100)
+            if '-squad' in model_shortcut_name or '-mrpc' in model_shortcut_name or '-mnli' in model_shortcut_name:
+                if not only_convert_finetuned_models:
+                    print("    Skipping finetuned checkpoint {}".format(model_shortcut_name))
+                    continue
+                model_type = model_shortcut_name
+            elif only_convert_finetuned_models:
+                print("    Skipping not finetuned checkpoint {}".format(model_shortcut_name))
+                continue
+            print("    Converting checkpoint {}/{}: {} - model_type {}".format(i, len(aws_config_map), model_shortcut_name, model_type))
+            print("-" * 100)
+            if config_shortcut_name in aws_config_map:
+                config_file = cached_path(aws_config_map[config_shortcut_name], force_download=not use_cached_models)
+            else:
+                config_file = cached_path(config_shortcut_name, force_download=not use_cached_models)
+            if model_shortcut_name in aws_model_maps:
+                model_file = cached_path(aws_model_maps[model_shortcut_name], force_download=not use_cached_models)
+            else:
+                model_file = cached_path(model_shortcut_name, force_download=not use_cached_models)
+            convert_pt_checkpoint_to_tf(model_type,
+                                        model_file,
+                                        config_file,
+                                        os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
+                                        compare_with_pt_model=compare_with_pt_model)
+            os.remove(config_file)
+            os.remove(model_file)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ## Required parameters
+    parser.add_argument("--tf_dump_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the output Tensorflow dump file.")
+    parser.add_argument("--model_type",
+                        default = None,
+                        type = str,
+                        help = "Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format(list(MODEL_CLASSES.keys())))
+    parser.add_argument("--pytorch_checkpoint_path",
+                        default = None,
+                        type = str,
+                        help = "Path to the PyTorch checkpoint path or shortcut name to download from AWS. "
+                               "If not given, will download and convert all the checkpoints from AWS.")
+    parser.add_argument("--config_file",
+                        default = None,
+                        type = str,
+                        help = "The config json file corresponding to the pre-trained model. \n"
+                               "This specifies the model architecture. If not given and "
+                               "--pytorch_checkpoint_path is not given or is a shortcut name"
+                               "use the configuration associated to the shortcut name on the AWS")
+    parser.add_argument("--compare_with_pt_model",
+                        action='store_true',
+                        help = "Compare Tensorflow and PyTorch model predictions.")
+    parser.add_argument("--use_cached_models",
+                        action='store_true',
+                        help = "Use cached models if possible instead of updating to latest checkpoint versions.")
+    parser.add_argument("--only_convert_finetuned_models",
+                        action='store_true',
+                        help = "Only convert finetuned models.")
+    args = parser.parse_args()
+    # if args.pytorch_checkpoint_path is not None:
+    #     convert_pt_checkpoint_to_tf(args.model_type.lower(),
+    #                                 args.pytorch_checkpoint_path,
+    #                                 args.config_file if args.config_file is not None else args.pytorch_checkpoint_path,
+    #                                 args.tf_dump_path,
+    #                                 compare_with_pt_model=args.compare_with_pt_model,
+    #                                 use_cached_models=args.use_cached_models)
+    # else:
+    convert_all_pt_checkpoints_to_tf(args.model_type.lower() if args.model_type is not None else None,
+                                        args.tf_dump_path,
+                                        model_shortcut_names_or_path=[args.pytorch_checkpoint_path] if args.pytorch_checkpoint_path is not None else None,
+                                        config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
+                                        compare_with_pt_model=args.compare_with_pt_model,
+                                        use_cached_models=args.use_cached_models,
+                                        only_convert_finetuned_models=args.only_convert_finetuned_models)
--- a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
@@ -23,12 +23,12 @@ import torch
 from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
 from fairseq.modules import TransformerSentenceEncoderLayer
-from pytorch_transformers import (BertConfig, BertEncoder,
+from transformers import (BertConfig, BertEncoder,
                                                BertIntermediate, BertLayer,
                                                BertModel, BertOutput,
                                                BertSelfAttention,
                                                BertSelfOutput)
-from pytorch_transformers import (RobertaEmbeddings,
+from transformers import (RobertaEmbeddings,
                                                   RobertaForMaskedLM,
                                                   RobertaForSequenceClassification,
                                                   RobertaModel)

--- a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -23,12 +23,12 @@ from io import open
 import torch
-import pytorch_transformers.tokenization_transfo_xl as data_utils
+import transformers.tokenization_transfo_xl as data_utils
-from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
+from transformers import CONFIG_NAME, WEIGHTS_NAME
-from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
+from transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
                                                      load_tf_weights_in_transfo_xl)
-from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
+from transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
 if sys.version_info[0] == 2:
    import cPickle as pickle

--- a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
@@ -23,8 +23,8 @@ from io import open
 import torch
 import numpy
-from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
+from transformers import CONFIG_NAME, WEIGHTS_NAME
-from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES
+from transformers.tokenization_xlm import VOCAB_FILES_NAMES
 import logging
 logging.basicConfig(level=logging.INFO)
@@ -33,7 +33,15 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
    # Load checkpoint
    chkpt = torch.load(xlm_checkpoint_path, map_location='cpu')
-    model = chkpt['model']
+    state_dict = chkpt['model']
+    # We have the base model one level deeper than the original XLM repository
+    two_levels_state_dict = {}
+    for k, v in state_dict.items():
+        if 'pred_layer' in k:
+            two_levels_state_dict[k] = v
+        else:
+            two_levels_state_dict['transformer.' + k] = v
    config = chkpt['params']
    config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
@@ -47,7 +55,7 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
    pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' +  VOCAB_FILES_NAMES['vocab_file']
    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
-    torch.save(model, pytorch_weights_dump_path)
+    torch.save(two_levels_state_dict, pytorch_weights_dump_path)
    print("Save configuration file to {}".format(pytorch_config_dump_path))
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:

--- a/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
@@ -22,7 +22,7 @@ import os
 import argparse
 import torch
-from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
+from transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                                    XLNetConfig,
                                                    XLNetLMHeadModel, XLNetForQuestionAnswering,
                                                    XLNetForSequenceClassification,

--- a/transformers/data/__init__.py
+++ b/transformers/data/__init__.py
+from .processors import InputExample, InputFeatures, DataProcessor
+from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
+from .metrics import is_sklearn_available
+if is_sklearn_available():
+    from .metrics import glue_compute_metrics
--- a/transformers/data/metrics/__init__.py
+++ b/transformers/data/metrics/__init__.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import csv
+import sys
+import logging
+logger = logging.getLogger(__name__)
+try:
+    from scipy.stats import pearsonr, spearmanr
+    from sklearn.metrics import matthews_corrcoef, f1_score
+    _has_sklearn = True
+except (AttributeError, ImportError) as e:
+    logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html")
+    _has_sklearn = False
+def is_sklearn_available():
+    return _has_sklearn
+if _has_sklearn:
+    def simple_accuracy(preds, labels):
+        return (preds == labels).mean()
+    def acc_and_f1(preds, labels):
+        acc = simple_accuracy(preds, labels)
+        f1 = f1_score(y_true=labels, y_pred=preds)
+        return {
+            "acc": acc,
+            "f1": f1,
+            "acc_and_f1": (acc + f1) / 2,
+        }
+    def pearson_and_spearman(preds, labels):
+        pearson_corr = pearsonr(preds, labels)[0]
+        spearman_corr = spearmanr(preds, labels)[0]
+        return {
+            "pearson": pearson_corr,
+            "spearmanr": spearman_corr,
+            "corr": (pearson_corr + spearman_corr) / 2,
+        }
+    def glue_compute_metrics(task_name, preds, labels):
+        assert len(preds) == len(labels)
+        if task_name == "cola":
+            return {"mcc": matthews_corrcoef(labels, preds)}
+        elif task_name == "sst-2":
+            return {"acc": simple_accuracy(preds, labels)}
+        elif task_name == "mrpc":
+            return acc_and_f1(preds, labels)
+        elif task_name == "sts-b":
+            return pearson_and_spearman(preds, labels)
+        elif task_name == "qqp":
+            return acc_and_f1(preds, labels)
+        elif task_name == "mnli":
+            return {"acc": simple_accuracy(preds, labels)}
+        elif task_name == "mnli-mm":
+            return {"acc": simple_accuracy(preds, labels)}
+        elif task_name == "qnli":
+            return {"acc": simple_accuracy(preds, labels)}
+        elif task_name == "rte":
+            return {"acc": simple_accuracy(preds, labels)}
+        elif task_name == "wnli":
+            return {"acc": simple_accuracy(preds, labels)}
+        else:
+            raise KeyError(task_name)
--- a/transformers/data/processors/__init__.py
+++ b/transformers/data/processors/__init__.py
+from .utils import InputExample, InputFeatures, DataProcessor
+from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -13,84 +13,154 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" BERT classification fine-tuning: utilities to work with GLUE tasks """
+""" GLUE processors and helpers """
-from __future__ import absolute_import, division, print_function
-import csv
 import logging
 import os
-import sys
-from io import open
-from scipy.stats import pearsonr, spearmanr
+from .utils import DataProcessor, InputExample, InputFeatures
-from sklearn.metrics import matthews_corrcoef, f1_score
+from ...file_utils import is_tf_available
+if is_tf_available():
+    import tensorflow as tf
 logger = logging.getLogger(__name__)
-class InputExample(object):
+def glue_convert_examples_to_features(examples, tokenizer,
-    """A single training/test example for simple sequence classification."""
+                                      max_length=512,
+                                      task=None,
+                                      label_list=None,
+                                      output_mode=None,
+                                      pad_on_left=False,
+                                      pad_token=0,
+                                      pad_token_segment_id=0,
+                                      mask_padding_with_zero=True):
+    """
+    Loads a data file into a list of ``InputFeatures``
+    Args:
+        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
+        tokenizer: Instance of a tokenizer that will tokenize the examples
+        max_length: Maximum example length
+        task: GLUE task
+        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
+        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
+        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
+        pad_token: Padding token
+        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
+        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
+            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
+            actual values)
+    Returns:
+        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
+        containing the task-specific features. If the input is a list of ``InputExamples``, will return
+        a list of task-specific ``InputFeatures`` which can be fed to the model.
-    def __init__(self, guid, text_a, text_b=None, label=None):
+    """
-        """Constructs a InputExample.
+    is_tf_dataset = False
+    if is_tf_available() and isinstance(examples, tf.data.Dataset):
+        is_tf_dataset = True
-        Args:
+    if task is not None:
-            guid: Unique id for the example.
+        processor = glue_processors[task]()
-            text_a: string. The untokenized text of the first sequence. For single
+        if label_list is None:
-            sequence tasks, only this sequence must be specified.
+            label_list = processor.get_labels()
-            text_b: (Optional) string. The untokenized text of the second sequence.
+            logger.info("Using label list %s for task %s" % (label_list, task))
-            Only must be specified for sequence pair tasks.
+        if output_mode is None:
-            label: (Optional) string. The label of the example. This should be
+            output_mode = glue_output_modes[task]
-            specified for train and dev examples, but not for test examples.
+            logger.info("Using output mode %s for task %s" % (output_mode, task))
-        """
-        self.guid = guid
-        self.text_a = text_a
-        self.text_b = text_b
-        self.label = label
+    label_map = {label: i for i, label in enumerate(label_list)}
-class InputFeatures(object):
+    features = []
-    """A single set of features of data."""
+    for (ex_index, example) in enumerate(examples):
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d" % (ex_index))
+        if is_tf_dataset:
+            example = processor.get_example_from_tensor_dict(example)
+        inputs = tokenizer.encode_plus(
+            example.text_a,
+            example.text_b,
+            add_special_tokens=True,
+            max_length=max_length,
+            truncate_first_sequence=True  # We're truncating the first sequence in priority
+        )
+        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
-    def __init__(self, input_ids, input_mask, segment_ids, label_id):
+        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        self.input_ids = input_ids
+        # tokens are attended to.
-        self.input_mask = input_mask
+        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-        self.segment_ids = segment_ids
-        self.label_id = label_id
+        # Zero-pad up to the sequence length.
+        padding_length = max_length - len(input_ids)
+        if pad_on_left:
+            input_ids = ([pad_token] * padding_length) + input_ids
+            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
+            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
+        else:
+            input_ids = input_ids + ([pad_token] * padding_length)
+            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
-class DataProcessor(object):
+        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
-    """Base class for data converters for sequence classification data sets."""
+        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
+        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
-    def get_train_examples(self, data_dir):
+        if output_mode == "classification":
-        """Gets a collection of `InputExample`s for the train set."""
+            label = label_map[example.label]
-        raise NotImplementedError()
+        elif output_mode == "regression":
+            label = float(example.label)
+        else:
+            raise KeyError(output_mode)
-    def get_dev_examples(self, data_dir):
+        if ex_index < 5:
-        """Gets a collection of `InputExample`s for the dev set."""
+            logger.info("*** Example ***")
-        raise NotImplementedError()
+            logger.info("guid: %s" % (example.guid))
+            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
+            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
+            logger.info("label: %s (id = %d)" % (example.label, label))
-    def get_labels(self):
+        features.append(
-        """Gets the list of labels for this data set."""
+                InputFeatures(input_ids=input_ids,
-        raise NotImplementedError()
+                              attention_mask=attention_mask,
+                              token_type_ids=token_type_ids,
-    @classmethod
+                              label=label))
-    def _read_tsv(cls, input_file, quotechar=None):
-        """Reads a tab separated value file."""
+    if is_tf_available() and is_tf_dataset:
-        with open(input_file, "r", encoding="utf-8-sig") as f:
+        def gen():
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            for ex in features:
-            lines = []
+                yield  ({'input_ids': ex.input_ids,
-            for line in reader:
+                         'attention_mask': ex.attention_mask,
-                if sys.version_info[0] == 2:
+                         'token_type_ids': ex.token_type_ids},
-                    line = list(unicode(cell, 'utf-8') for cell in line)
+                        ex.label)
-                lines.append(line)
-            return lines
+        return tf.data.Dataset.from_generator(gen,
+            ({'input_ids': tf.int32,
+              'attention_mask': tf.int32,
+              'token_type_ids': tf.int32},
+             tf.int64),
+            ({'input_ids': tf.TensorShape([None]),
+              'attention_mask': tf.TensorShape([None]),
+              'token_type_ids': tf.TensorShape([None])},
+             tf.TensorShape([])))
+    return features
 class MrpcProcessor(DataProcessor):
    """Processor for the MRPC data set (GLUE version)."""
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(tensor_dict['idx'].numpy(),
+                            tensor_dict['sentence1'].numpy().decode('utf-8'),
+                            tensor_dict['sentence2'].numpy().decode('utf-8'),
+                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
@@ -124,6 +194,13 @@ class MrpcProcessor(DataProcessor):
 class MnliProcessor(DataProcessor):
    """Processor for the MultiNLI data set (GLUE version)."""
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(tensor_dict['idx'].numpy(),
+                            tensor_dict['premise'].numpy().decode('utf-8'),
+                            tensor_dict['hypothesis'].numpy().decode('utf-8'),
+                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -167,6 +244,13 @@ class MnliMismatchedProcessor(MnliProcessor):
 class ColaProcessor(DataProcessor):
    """Processor for the CoLA data set (GLUE version)."""
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(tensor_dict['idx'].numpy(),
+                            tensor_dict['sentence'].numpy().decode('utf-8'),
+                            None,
+                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -196,6 +280,13 @@ class ColaProcessor(DataProcessor):
 class Sst2Processor(DataProcessor):
    """Processor for the SST-2 data set (GLUE version)."""
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(tensor_dict['idx'].numpy(),
+                            tensor_dict['sentence'].numpy().decode('utf-8'),
+                            None,
+                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -227,6 +318,13 @@ class Sst2Processor(DataProcessor):
 class StsbProcessor(DataProcessor):
    """Processor for the STS-B data set (GLUE version)."""
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(tensor_dict['idx'].numpy(),
+                            tensor_dict['sentence1'].numpy().decode('utf-8'),
+                            tensor_dict['sentence2'].numpy().decode('utf-8'),
+                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -259,6 +357,13 @@ class StsbProcessor(DataProcessor):
 class QqpProcessor(DataProcessor):
    """Processor for the QQP data set (GLUE version)."""
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(tensor_dict['idx'].numpy(),
+                            tensor_dict['question1'].numpy().decode('utf-8'),
+                            tensor_dict['question2'].numpy().decode('utf-8'),
+                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -294,6 +399,13 @@ class QqpProcessor(DataProcessor):
 class QnliProcessor(DataProcessor):
    """Processor for the QNLI data set (GLUE version)."""
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(tensor_dict['idx'].numpy(),
+                            tensor_dict['question'].numpy().decode('utf-8'),
+                            tensor_dict['sentence'].numpy().decode('utf-8'),
+                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -302,7 +414,7 @@ class QnliProcessor(DataProcessor):
    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), 
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")),
            "dev_matched")
    def get_labels(self):
@@ -327,6 +439,13 @@ class QnliProcessor(DataProcessor):
 class RteProcessor(DataProcessor):
    """Processor for the RTE data set (GLUE version)."""
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(tensor_dict['idx'].numpy(),
+                            tensor_dict['sentence1'].numpy().decode('utf-8'),
+                            tensor_dict['sentence2'].numpy().decode('utf-8'),
+                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -359,6 +478,13 @@ class RteProcessor(DataProcessor):
 class WnliProcessor(DataProcessor):
    """Processor for the WNLI data set (GLUE version)."""
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(tensor_dict['idx'].numpy(),
+                            tensor_dict['sentence1'].numpy().decode('utf-8'),
+                            tensor_dict['sentence2'].numpy().decode('utf-8'),
+                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -387,198 +513,19 @@ class WnliProcessor(DataProcessor):
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples
+glue_tasks_num_labels = {
+    "cola": 2,
+    "mnli": 3,
+    "mrpc": 2,
+    "sst-2": 2,
+    "sts-b": 1,
+    "qqp": 2,
+    "qnli": 2,
+    "rte": 2,
+    "wnli": 2,
+}
-def convert_examples_to_features(examples, label_list, max_seq_length,
+glue_processors = {
-                                 tokenizer, output_mode,
-                                 cls_token_at_end=False,
-                                 cls_token='[CLS]',
-                                 cls_token_segment_id=1,
-                                 sep_token='[SEP]',
-                                 sep_token_extra=False,
-                                 pad_on_left=False,
-                                 pad_token=0,
-                                 pad_token_segment_id=0,
-                                 sequence_a_segment_id=0, 
-                                 sequence_b_segment_id=1,
-                                 mask_padding_with_zero=True):
-    """ Loads a data file into a list of `InputBatch`s
-        `cls_token_at_end` define the location of the CLS token:
-            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
-            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
-        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
-    """
-    label_map = {label : i for i, label in enumerate(label_list)}
-    features = []
-    for (ex_index, example) in enumerate(examples):
-        if ex_index % 10000 == 0:
-            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
-        tokens_a = tokenizer.tokenize(example.text_a)
-        tokens_b = None
-        if example.text_b:
-            tokens_b = tokenizer.tokenize(example.text_b)
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
-            special_tokens_count = 4 if sep_token_extra else 3
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
-        else:
-            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
-            special_tokens_count = 3 if sep_token_extra else 2
-            if len(tokens_a) > max_seq_length - special_tokens_count:
-                tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids:   0   0   0   0  0     0   0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens = tokens_a + [sep_token]
-        if sep_token_extra:
-            # roberta uses an extra separator b/w pairs of sentences
-            tokens += [sep_token]
-        segment_ids = [sequence_a_segment_id] * len(tokens)
-        if tokens_b:
-            tokens += tokens_b + [sep_token]
-            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
-        if cls_token_at_end:
-            tokens = tokens + [cls_token]
-            segment_ids = segment_ids + [cls_token_segment_id]
-        else:
-            tokens = [cls_token] + tokens
-            segment_ids = [cls_token_segment_id] + segment_ids
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        # tokens are attended to.
-        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-        # Zero-pad up to the sequence length.
-        padding_length = max_seq_length - len(input_ids)
-        if pad_on_left:
-            input_ids = ([pad_token] * padding_length) + input_ids
-            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
-            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
-        else:
-            input_ids = input_ids + ([pad_token] * padding_length)
-            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
-            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
-        assert len(input_ids) == max_seq_length
-        assert len(input_mask) == max_seq_length
-        assert len(segment_ids) == max_seq_length
-        if output_mode == "classification":
-            label_id = label_map[example.label]
-        elif output_mode == "regression":
-            label_id = float(example.label)
-        else:
-            raise KeyError(output_mode)
-        if ex_index < 5:
-            logger.info("*** Example ***")
-            logger.info("guid: %s" % (example.guid))
-            logger.info("tokens: %s" % " ".join(
-                    [str(x) for x in tokens]))
-            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-            logger.info("label: %s (id = %d)" % (example.label, label_id))
-        features.append(
-                InputFeatures(input_ids=input_ids,
-                              input_mask=input_mask,
-                              segment_ids=segment_ids,
-                              label_id=label_id))
-    return features
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-def simple_accuracy(preds, labels):
-    return (preds == labels).mean()
-def acc_and_f1(preds, labels):
-    acc = simple_accuracy(preds, labels)
-    f1 = f1_score(y_true=labels, y_pred=preds)
-    return {
-        "acc": acc,
-        "f1": f1,
-        "acc_and_f1": (acc + f1) / 2,
-    }
-def pearson_and_spearman(preds, labels):
-    pearson_corr = pearsonr(preds, labels)[0]
-    spearman_corr = spearmanr(preds, labels)[0]
-    return {
-        "pearson": pearson_corr,
-        "spearmanr": spearman_corr,
-        "corr": (pearson_corr + spearman_corr) / 2,
-    }
-def compute_metrics(task_name, preds, labels):
-    assert len(preds) == len(labels)
-    if task_name == "cola":
-        return {"mcc": matthews_corrcoef(labels, preds)}
-    elif task_name == "sst-2":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "mrpc":
-        return acc_and_f1(preds, labels)
-    elif task_name == "sts-b":
-        return pearson_and_spearman(preds, labels)
-    elif task_name == "qqp":
-        return acc_and_f1(preds, labels)
-    elif task_name == "mnli":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "mnli-mm":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "qnli":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "rte":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "wnli":
-        return {"acc": simple_accuracy(preds, labels)}
-    else:
-        raise KeyError(task_name)
-processors = {
    "cola": ColaProcessor,
    "mnli": MnliProcessor,
    "mnli-mm": MnliMismatchedProcessor,
@@ -591,7 +538,7 @@ processors = {
    "wnli": WnliProcessor,
 }
-output_modes = {
+glue_output_modes = {
    "cola": "classification",
    "mnli": "classification",
    "mnli-mm": "classification",
@@ -603,15 +550,3 @@ output_modes = {
    "rte": "classification",
    "wnli": "classification",
 }
-GLUE_TASKS_NUM_LABELS = {
-    "cola": 2,
-    "mnli": 3,
-    "mrpc": 2,
-    "sst-2": 2,
-    "sts-b": 1,
-    "qqp": 2,
-    "qnli": 2,
-    "rte": 2,
-    "wnli": 2,
-}
--- a/transformers/data/processors/utils.py
+++ b/transformers/data/processors/utils.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import csv
+import sys
+import copy
+import json
+class InputExample(object):
+    """
+    A single training/test example for simple sequence classification.
+    Args:
+        guid: Unique id for the example.
+        text_a: string. The untokenized text of the first sequence. For single
+        sequence tasks, only this sequence must be specified.
+        text_b: (Optional) string. The untokenized text of the second sequence.
+        Only must be specified for sequence pair tasks.
+        label: (Optional) string. The label of the example. This should be
+        specified for train and dev examples, but not for test examples.
+    """
+    def __init__(self, guid, text_a, text_b=None, label=None):
+        self.guid = guid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.label = label
+    def __repr__(self):
+        return str(self.to_json_string())
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+class InputFeatures(object):
+    """
+    A single set of features of data.
+    Args:
+        input_ids: Indices of input sequence tokens in the vocabulary.
+        attention_mask: Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            Usually  ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
+        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
+        label: Label corresponding to the input
+    """
+    def __init__(self, input_ids, attention_mask, token_type_ids, label):
+        self.input_ids = input_ids
+        self.attention_mask = attention_mask
+        self.token_type_ids = token_type_ids
+        self.label = label
+    def __repr__(self):
+        return str(self.to_json_string())
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+class DataProcessor(object):
+    """Base class for data converters for sequence classification data sets."""
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """Gets an example from a dict with tensorflow tensors
+        Args:
+            tensor_dict: Keys and values should match the corresponding Glue
+                tensorflow_dataset examples.
+        """
+        raise NotImplementedError()
+    def get_train_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the train set."""
+        raise NotImplementedError()
+    def get_dev_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the dev set."""
+        raise NotImplementedError()
+    def get_labels(self):
+        """Gets the list of labels for this data set."""
+        raise NotImplementedError()
+    @classmethod
+    def _read_tsv(cls, input_file, quotechar=None):
+        """Reads a tab separated value file."""
+        with open(input_file, "r", encoding="utf-8-sig") as f:
+            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            lines = []
+            for line in reader:
+                if sys.version_info[0] == 2:
+                    line = list(unicode(cell, 'utf-8') for cell in line)
+                lines.append(line)
+            return lines
--- a/pytorch_transformers/file_utils.py
+++ b/pytorch_transformers/file_utils.py
@@ -23,6 +23,24 @@ from botocore.exceptions import ClientError
 import requests
 from tqdm import tqdm
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+try:
+    import tensorflow as tf
+    assert int(tf.__version__[0]) >= 2
+    _tf_available = True  # pylint: disable=invalid-name
+    logger.info("TensorFlow version {} available.".format(tf.__version__))
+except (ImportError, AssertionError):
+    _tf_available = False  # pylint: disable=invalid-name
+try:
+    import torch
+    _torch_available = True  # pylint: disable=invalid-name
+    logger.info("PyTorch version {} available.".format(torch.__version__))
+except ImportError:
+    _torch_available = False  # pylint: disable=invalid-name
 try:
    from torch.hub import _get_torch_home
    torch_cache_home = _get_torch_home()
@@ -30,7 +48,7 @@ except ImportError:
    torch_cache_home = os.path.expanduser(
        os.getenv('TORCH_HOME', os.path.join(
            os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
-default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers')
+default_cache_path = os.path.join(torch_cache_home, 'transformers')
 try:
    from urllib.parse import urlparse
@@ -47,12 +65,18 @@ except (AttributeError, ImportError):
                                                        default_cache_path))
 PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
+TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
 WEIGHTS_NAME = "pytorch_model.bin"
+TF2_WEIGHTS_NAME = 'tf_model.h5'
 TF_WEIGHTS_NAME = 'model.ckpt'
 CONFIG_NAME = "config.json"
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+def is_torch_available():
+    return _torch_available
+def is_tf_available():
+    return _tf_available
 if not six.PY2:
    def add_start_docstrings(*docstr):
@@ -83,6 +107,9 @@ def url_to_filename(url, etag=None):
    Convert `url` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the url's, delimited
    by a period.
+    If the url ends with .h5 (Keras HDF5 weights) ands '.h5' to the name
+    so that TF 2.0 can identify it as a HDF5 file
+    (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
    """
    url_bytes = url.encode('utf-8')
    url_hash = sha256(url_bytes)
@@ -93,6 +120,9 @@ def url_to_filename(url, etag=None):
        etag_hash = sha256(etag_bytes)
        filename += '.' + etag_hash.hexdigest()
+    if url.endswith('.h5'):
+        filename += '.h5'
    return filename
@@ -102,7 +132,7 @@ def filename_to_url(filename, cache_dir=None):
    Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
    """
    if cache_dir is None:
-        cache_dir = PYTORCH_TRANSFORMERS_CACHE
+        cache_dir = TRANSFORMERS_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)
@@ -133,7 +163,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
        force_download: if True, re-dowload the file even if it's already cached in the cache dir.
    """
    if cache_dir is None:
-        cache_dir = PYTORCH_TRANSFORMERS_CACHE
+        cache_dir = TRANSFORMERS_CACHE
    if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
@@ -222,7 +252,7 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
-        cache_dir = PYTORCH_TRANSFORMERS_CACHE
+        cache_dir = TRANSFORMERS_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)
    if sys.version_info[0] == 2 and not isinstance(cache_dir, str):

--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
@@ -36,7 +36,7 @@ logger = logging.getLogger(__name__)
 class AutoModel(object):
    r"""
-        :class:`~pytorch_transformers.AutoModel` is a generic model class
+        :class:`~transformers.AutoModel` is a generic model class
        that will be instantiated as one of the base model classes of the library
        when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
        class method.
@@ -84,23 +84,23 @@ class AutoModel(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
@@ -120,7 +120,7 @@ class AutoModel(object):
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
@@ -157,7 +157,7 @@ class AutoModel(object):
 class AutoModelWithLMHead(object):
    r"""
-        :class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class
+        :class:`~transformers.AutoModelWithLMHead` is a generic model class
        that will be instantiated as one of the language modeling model classes of the library
        when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
        class method.
@@ -208,23 +208,23 @@ class AutoModelWithLMHead(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
@@ -244,7 +244,7 @@ class AutoModelWithLMHead(object):
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
@@ -281,7 +281,7 @@ class AutoModelWithLMHead(object):
 class AutoModelForSequenceClassification(object):
    r"""
-        :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a generic model class
+        :class:`~transformers.AutoModelForSequenceClassification` is a generic model class
        that will be instantiated as one of the sequence classification model classes of the library
        when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
        class method.
@@ -326,23 +326,23 @@ class AutoModelForSequenceClassification(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
@@ -362,7 +362,7 @@ class AutoModelForSequenceClassification(object):
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
@@ -392,7 +392,7 @@ class AutoModelForSequenceClassification(object):
 class AutoModelForQuestionAnswering(object):
    r"""
-        :class:`~pytorch_transformers.AutoModelForQuestionAnswering` is a generic model class
+        :class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
        that will be instantiated as one of the question answering model classes of the library
        when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
        class method.
@@ -435,23 +435,23 @@ class AutoModelForQuestionAnswering(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
@@ -471,7 +471,7 @@ class AutoModelForQuestionAnswering(object):
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::

--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -118,26 +118,27 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
 def gelu(x):
-    """Implementation of the gelu activation function.
+    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+def gelu_new(x):
+    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
+        Also see https://arxiv.org/abs/1606.08415
+    """
+    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
 def swish(x):
    return x * torch.sigmoid(x)
-ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
+ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new}
-try:
+BertLayerNorm = torch.nn.LayerNorm
-    from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
-except (ImportError, AttributeError) as e:
-    logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
-    BertLayerNorm = torch.nn.LayerNorm
 class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
@@ -195,7 +196,7 @@ class BertSelfAttention(nn.Module):
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)
-    def forward(self, hidden_states, attention_mask, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)
@@ -207,8 +208,9 @@ class BertSelfAttention(nn.Module):
        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
-        # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+        if attention_mask is not None:
-        attention_scores = attention_scores + attention_mask
+            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+            attention_scores = attention_scores + attention_mask
        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)
@@ -275,7 +277,7 @@ class BertAttention(nn.Module):
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)
-    def forward(self, input_tensor, attention_mask, head_mask=None):
+    def forward(self, input_tensor, attention_mask=None, head_mask=None):
        self_outputs = self.self(input_tensor, attention_mask, head_mask)
        attention_output = self.output(self_outputs[0], input_tensor)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
@@ -318,7 +320,7 @@ class BertLayer(nn.Module):
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)
-    def forward(self, hidden_states, attention_mask, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
        attention_output = attention_outputs[0]
        intermediate_output = self.intermediate(attention_output)
@@ -334,7 +336,7 @@ class BertEncoder(nn.Module):
        self.output_hidden_states = config.output_hidden_states
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
-    def forward(self, hidden_states, attention_mask, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        all_hidden_states = ()
        all_attentions = ()
        for i, layer_module in enumerate(self.layer):
@@ -480,9 +482,9 @@ BERT_START_DOCSTRING = r"""    The BERT model was proposed in
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
-        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. 
            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 BERT_INPUTS_DOCSTRING = r"""
@@ -506,9 +508,9 @@ BERT_INPUTS_DOCSTRING = r"""
            Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
-            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
+            Indices can be obtained using :class:`transformers.BertTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:

--- a/pytorch_transformers/modeling_distilbert.py
+++ b/pytorch_transformers/modeling_distilbert.py
@@ -372,9 +372,9 @@ DISTILBERT_START_DOCSTRING = r"""
        https://medium.com/huggingface/distilbert-8cf3380435b5
    Parameters:
-        config (:class:`~pytorch_transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. 
            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 DISTILBERT_INPUTS_DOCSTRING = r"""
@@ -649,7 +649,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss, start_scores, end_scores = outputs[:2]
+        loss, start_scores, end_scores = outputs[:3]
    """
    def __init__(self, config):

--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -38,7 +38,8 @@ logger = logging.getLogger(__name__)
 GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
                                     "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
-                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"}
+                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin",
+                                     "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",}
 def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
    """ Load tf checkpoints in a pytorch model
@@ -280,9 +281,9 @@ GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
-        config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 GPT2_INPUTS_DOCSTRING = r"""    Inputs:
@@ -290,9 +291,9 @@ GPT2_INPUTS_DOCSTRING = r"""    Inputs:
            Indices of input sequence tokens in the vocabulary.
            GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
-            Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
+            Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **past**:
            list of ``torch.FloatTensor`` (one for each layer):
            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
@@ -367,6 +368,13 @@ class GPT2Model(GPT2PreTrainedModel):
            self.h[layer].attn.prune_heads(heads)
    def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_shape[-1])
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, input_shape[-1])
+        if position_ids is not None:
+            position_ids = position_ids.view(-1, input_shape[-1])
        if past is None:
            past_length = 0
            past = [None] * len(self.h)
@@ -378,6 +386,7 @@ class GPT2Model(GPT2PreTrainedModel):
        # Attention mask.
        if attention_mask is not None:
+            attention_mask = attention_mask.view(-1, input_shape[-1])
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
@@ -407,14 +416,9 @@ class GPT2Model(GPT2PreTrainedModel):
        else:
            head_mask = [None] * self.config.n_layer
-        input_shape = input_ids.size()
-        input_ids = input_ids.view(-1, input_ids.size(-1))
-        position_ids = position_ids.view(-1, position_ids.size(-1))
        inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        if token_type_ids is not None:
-            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
            token_type_embeds = self.wte(token_type_ids)
        else:
            token_type_embeds = 0
@@ -490,7 +494,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
    Examples::
        import torch
-        from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
+        from transformers import GPT2Tokenizer, GPT2LMHeadModel
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2LMHeadModel.from_pretrained('gpt2')
@@ -586,7 +590,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
    Examples::
        import torch
-        from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
+        from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2DoubleHeadsModel.from_pretrained('gpt2')

--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -294,9 +294,9 @@ OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
-        config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
@@ -304,9 +304,9 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
            Indices of input sequence tokens in the vocabulary.
            GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
-            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
+            Indices can be obtained using :class:`transformers.BPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:

--- a/pytorch_transformers/modeling_roberta.py
+++ b/pytorch_transformers/modeling_roberta.py
@@ -43,6 +43,9 @@ class RobertaEmbeddings(BertEmbeddings):
    def __init__(self, config):
        super(RobertaEmbeddings, self).__init__(config)
        self.padding_idx = 1
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size,
+                                                padding_idx=self.padding_idx)
    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        seq_length = input_ids.size(1)
@@ -77,9 +80,9 @@ ROBERTA_START_DOCSTRING = r"""    The RoBERTa model was proposed in
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
-        config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the 
+        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the 
            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 ROBERTA_INPUTS_DOCSTRING = r"""
@@ -102,8 +105,8 @@ ROBERTA_INPUTS_DOCSTRING = r"""
            RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
@@ -361,9 +364,9 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
                ``token_type_ids:   0   0   0   0  0     0   0``
-            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
+            Indices can be obtained using :class:`transformers.BertTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
            Segment token indices to indicate first and second portions of the inputs.
            The second dimension of the input (`num_choices`) indicates the number of choices to score.

--- a/transformers/modeling_tf_auto.py
+++ b/transformers/modeling_tf_auto.py
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Auto Model class. """
+from __future__ import absolute_import, division, print_function, unicode_literals
+import logging
+from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering
+from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel
+from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel
+from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel
+from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple
+from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple
+from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification
+from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification
+from .file_utils import add_start_docstrings
+logger = logging.getLogger(__name__)
+class TFAutoModel(object):
+    r"""
+        :class:`~transformers.TFAutoModel` is a generic model class
+        that will be instantiated as one of the base model classes of the library
+        when created with the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+        The base model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: TFDistilBertModel (DistilBERT model)
+            - contains `roberta`: TFRobertaModel (RoBERTa model)
+            - contains `bert`: TFBertModel (Bert model)
+            - contains `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
+            - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
+            - contains `xlnet`: TFXLNetModel (XLNet model)
+            - contains `xlm`: TFXLMModel (XLM model)
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("TFAutoModel is designed to be instantiated "
+            "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the base model classes of the library
+        from a pre-trained model configuration.
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: TFDistilBertModel (DistilBERT model)
+            - contains `roberta`: TFRobertaModel (RoBERTa model)
+            - contains `bert`: TFTFBertModel (Bert model)
+            - contains `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
+            - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
+            - contains `xlnet`: TFXLNetModel (XLNet model)
+            - contains `xlm`: TFXLMModel (XLM model)
+        Params:
+            pretrained_model_name_or_path: either:
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
+            from_pt: (`Optional`) Boolean
+                Set to True if the Checkpoint is a PyTorch checkpoint.
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+        Examples::
+            model = TFAutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = TFAutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = TFAutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return TFBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'openai-gpt' in pretrained_model_name_or_path:
+            return TFOpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'gpt2' in pretrained_model_name_or_path:
+            return TFGPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'transfo-xl' in pretrained_model_name_or_path:
+            return TFTransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return TFXLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return TFXLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+class TFAutoModelWithLMHead(object):
+    r"""
+        :class:`~transformers.TFAutoModelWithLMHead` is a generic model class
+        that will be instantiated as one of the language modeling model classes of the library
+        when created with the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
+            - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
+            - contains `bert`: TFBertForMaskedLM (Bert model)
+            - contains `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
+            - contains `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
+            - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
+            - contains `xlm`: TFXLMWithLMHeadModel (XLM model)
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated "
+            "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the language modeling model classes of the library
+        from a pre-trained model configuration.
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
+            - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
+            - contains `bert`: TFBertForMaskedLM (Bert model)
+            - contains `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
+            - contains `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
+            - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
+            - contains `xlm`: TFXLMWithLMHeadModel (XLM model)
+        Params:
+            pretrained_model_name_or_path: either:
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
+            from_pt: (`Optional`) Boolean
+                Set to True if the Checkpoint is a PyTorch checkpoint.
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+        Examples::
+            model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = TFAutoModelWithLMHead.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return TFBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'openai-gpt' in pretrained_model_name_or_path:
+            return TFOpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'gpt2' in pretrained_model_name_or_path:
+            return TFGPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'transfo-xl' in pretrained_model_name_or_path:
+            return TFTransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return TFXLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return TFXLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+class TFAutoModelForSequenceClassification(object):
+    r"""
+        :class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class
+        that will be instantiated as one of the sequence classification model classes of the library
+        when created with the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
+            - contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
+            - contains `bert`: TFBertForSequenceClassification (Bert model)
+            - contains `xlnet`: TFXLNetForSequenceClassification (XLNet model)
+            - contains `xlm`: TFXLMForSequenceClassification (XLM model)
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated "
+            "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the sequence classification model classes of the library
+        from a pre-trained model configuration.
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
+            - contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
+            - contains `bert`: TFBertForSequenceClassification (Bert model)
+            - contains `xlnet`: TFXLNetForSequenceClassification (XLNet model)
+            - contains `xlm`: TFXLMForSequenceClassification (XLM model)
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+        To train the model, you should first set it back in training mode with `model.train()`
+        Params:
+            pretrained_model_name_or_path: either:
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
+            from_pt: (`Optional`) Boolean
+                Set to True if the Checkpoint is a PyTorch checkpoint.
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+        Examples::
+            model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = TFAutoModelForSequenceClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = TFAutoModelForSequenceClassification.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return TFBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return TFXLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return TFXLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
+class TFAutoModelForQuestionAnswering(object):
+    r"""
+        :class:`~transformers.TFAutoModelForQuestionAnswering` is a generic model class
+        that will be instantiated as one of the question answering model classes of the library
+        when created with the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
+            - contains `bert`: TFBertForQuestionAnswering (Bert model)
+            - contains `xlnet`: TFXLNetForQuestionAnswering (XLNet model)
+            - contains `xlm`: TFXLMForQuestionAnswering (XLM model)
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated "
+            "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the question answering model classes of the library
+        from a pre-trained model configuration.
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
+            - contains `bert`: TFBertForQuestionAnswering (Bert model)
+            - contains `xlnet`: TFXLNetForQuestionAnswering (XLNet model)
+            - contains `xlm`: TFXLMForQuestionAnswering (XLM model)
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+        To train the model, you should first set it back in training mode with `model.train()`
+        Params:
+            pretrained_model_name_or_path: either:
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
+            from_pt: (`Optional`) Boolean
+                Set to True if the Checkpoint is a PyTorch checkpoint.
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+        Examples::
+            model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = TFAutoModelForQuestionAnswering.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = TFAutoModelForQuestionAnswering.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return TFDistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return TFBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return TFXLNetForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return TFXLMForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TF 2.0 BERT model. """
+from __future__ import absolute_import, division, print_function, unicode_literals
+import json
+import logging
+import math
+import os
+import sys
+from io import open
+import numpy as np
+import tensorflow as tf
+from .configuration_bert import BertConfig
+from .modeling_tf_utils import TFPreTrainedModel, get_initializer
+from .file_utils import add_start_docstrings
+from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
+logger = logging.getLogger(__name__)
+TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5",
+    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5",
+    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5",
+    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5",
+    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5",
+    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5",
+    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5",
+    'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5",
+    'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5",
+    'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5",
+    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
+    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
+    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
+}
+def load_bert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
+    # build the network
+    inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
+    tf_inputs = tf.constant(inputs_list)
+    tfo = tf_model(tf_inputs, training=False)
+    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
+def gelu(x):
+    """ Gaussian Error Linear Unit.
+    Original Implementation of the gelu activation function in Google Bert repo when initially created.
+        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
+        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+        Also see https://arxiv.org/abs/1606.08415
+    """
+    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
+    return x * cdf
+def gelu_new(x):
+    """Gaussian Error Linear Unit.
+    This is a smoother version of the RELU.
+    Original paper: https://arxiv.org/abs/1606.08415
+    Args:
+        x: float Tensor to perform activation.
+    Returns:
+        `x` with the GELU activation applied.
+    """
+    cdf = 0.5 * (1.0 + tf.tanh(
+        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
+    return x * cdf
+def swish(x):
+    return x * tf.sigmoid(x)
+ACT2FN = {"gelu": tf.keras.layers.Activation(gelu),
+          "relu": tf.keras.activations.relu,
+          "swish": tf.keras.layers.Activation(swish),
+          "gelu_new": tf.keras.layers.Activation(gelu_new)}
+class TFBertEmbeddings(tf.keras.layers.Layer):
+    """Construct the embeddings from word, position and token_type embeddings.
+    """
+    def __init__(self, config, **kwargs):
+        super(TFBertEmbeddings, self).__init__(**kwargs)
+        self.vocab_size = config.vocab_size
+        self.hidden_size = config.hidden_size
+        self.initializer_range = config.initializer_range
+        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
+                                                             config.hidden_size,
+                                                             embeddings_initializer=get_initializer(self.initializer_range),
+                                                             name='position_embeddings')
+        self.token_type_embeddings = tf.keras.layers.Embedding(config.type_vocab_size,
+                                                               config.hidden_size,
+                                                               embeddings_initializer=get_initializer(self.initializer_range),
+                                                               name='token_type_embeddings')
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
+        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+    def build(self, input_shape):
+        """Build shared word embedding layer """
+        with tf.name_scope("word_embeddings"):
+            # Create and initialize weights. The random normal initializer was chosen
+            # arbitrarily, and works well.
+            self.word_embeddings = self.add_weight(
+                "weight",
+                shape=[self.vocab_size, self.hidden_size],
+                initializer=get_initializer(self.initializer_range))
+        super(TFBertEmbeddings, self).build(input_shape)
+    def call(self, inputs, mode="embedding", training=False):
+        """Get token embeddings of inputs.
+        Args:
+            inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
+            mode: string, a valid value is one of "embedding" and "linear".
+        Returns:
+            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
+                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
+                linear tensor, float32 with shape [batch_size, length, vocab_size].
+        Raises:
+            ValueError: if mode is not valid.
+        Shared weights logic adapted from
+            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
+        """
+        if mode == "embedding":
+            return self._embedding(inputs, training=training)
+        elif mode == "linear":
+            return self._linear(inputs)
+        else:
+            raise ValueError("mode {} is not valid.".format(mode))
+    def _embedding(self, inputs, training=False):
+        """Applies embedding based on inputs tensor."""
+        input_ids, position_ids, token_type_ids = inputs
+        seq_length = tf.shape(input_ids)[1]
+        if position_ids is None:
+            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
+        if token_type_ids is None:
+            token_type_ids = tf.fill(tf.shape(input_ids), 0)
+        words_embeddings = tf.gather(self.word_embeddings, input_ids)
+        position_embeddings = self.position_embeddings(position_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = words_embeddings + position_embeddings + token_type_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings, training=training)
+        return embeddings
+    def _linear(self, inputs):
+        """Computes logits by running inputs through a linear layer.
+            Args:
+                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
+            Returns:
+                float32 tensor with shape [batch_size, length, vocab_size].
+        """
+        batch_size = tf.shape(inputs)[0]
+        length = tf.shape(inputs)[1]
+        x = tf.reshape(inputs, [-1, self.hidden_size])
+        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
+        return tf.reshape(logits, [batch_size, length, self.vocab_size])
+class TFBertSelfAttention(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFBertSelfAttention, self).__init__(**kwargs)
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                "The hidden size (%d) is not a multiple of the number of attention "
+                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
+        self.output_attentions = config.output_attentions
+        self.num_attention_heads = config.num_attention_heads
+        assert config.hidden_size % config.num_attention_heads == 0
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.query = tf.keras.layers.Dense(self.all_head_size,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name='query')
+        self.key = tf.keras.layers.Dense(self.all_head_size,
+                                         kernel_initializer=get_initializer(config.initializer_range),
+                                         name='key')
+        self.value = tf.keras.layers.Dense(self.all_head_size,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name='value')
+        self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
+    def transpose_for_scores(self, x, batch_size):
+        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
+        return tf.transpose(x, perm=[0, 2, 1, 3])
+    def call(self, inputs, training=False):
+        hidden_states, attention_mask, head_mask = inputs
+        batch_size = tf.shape(hidden_states)[0]
+        mixed_query_layer = self.query(hidden_states)
+        mixed_key_layer = self.key(hidden_states)
+        mixed_value_layer = self.value(hidden_states)
+        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
+        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
+        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)  # (batch size, num_heads, seq_len_q, seq_len_k)
+        dk = tf.cast(tf.shape(key_layer)[-1], tf.float32) # scale attention_scores
+        attention_scores = attention_scores / tf.math.sqrt(dk)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in TFBertModel call() function)
+            attention_scores = attention_scores + attention_mask
+        # Normalize the attention scores to probabilities.
+        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs, training=training)
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+        context_layer = tf.matmul(attention_probs, value_layer)
+        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
+        context_layer = tf.reshape(context_layer, 
+                                  (batch_size, -1, self.all_head_size))  # (batch_size, seq_len_q, all_head_size)
+        outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
+        return outputs
+class TFBertSelfOutput(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFBertSelfOutput, self).__init__(**kwargs)
+        self.dense = tf.keras.layers.Dense(config.hidden_size,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name='dense')
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
+        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+    def call(self, inputs, training=False):
+        hidden_states, input_tensor = inputs
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+class TFBertAttention(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFBertAttention, self).__init__(**kwargs)
+        self.self_attention = TFBertSelfAttention(config, name='self')
+        self.dense_output = TFBertSelfOutput(config, name='output')
+    def prune_heads(self, heads):
+        raise NotImplementedError
+    def call(self, inputs, training=False):
+        input_tensor, attention_mask, head_mask = inputs
+        self_outputs = self.self_attention([input_tensor, attention_mask, head_mask], training=training)
+        attention_output = self.dense_output([self_outputs[0], input_tensor], training=training)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+class TFBertIntermediate(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFBertIntermediate, self).__init__(**kwargs)
+        self.dense = tf.keras.layers.Dense(config.intermediate_size,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name='dense')
+        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+    def call(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+class TFBertOutput(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFBertOutput, self).__init__(**kwargs)
+        self.dense = tf.keras.layers.Dense(config.hidden_size,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name='dense')
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
+        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+    def call(self, inputs, training=False):
+        hidden_states, input_tensor = inputs
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+class TFBertLayer(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFBertLayer, self).__init__(**kwargs)
+        self.attention = TFBertAttention(config, name='attention')
+        self.intermediate = TFBertIntermediate(config, name='intermediate')
+        self.bert_output = TFBertOutput(config, name='output')
+    def call(self, inputs, training=False):
+        hidden_states, attention_mask, head_mask = inputs
+        attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training)
+        attention_output = attention_outputs[0]
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.bert_output([intermediate_output, attention_output], training=training)
+        outputs = (layer_output,) + attention_outputs[1:]  # add attentions if we output them
+        return outputs
+class TFBertEncoder(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFBertEncoder, self).__init__(**kwargs)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.layer = [TFBertLayer(config, name='layer_._{}'.format(i)) for i in range(config.num_hidden_layers)]
+    def call(self, inputs, training=False):
+        hidden_states, attention_mask, head_mask = inputs
+        all_hidden_states = ()
+        all_attentions = ()
+        for i, layer_module in enumerate(self.layer):
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+            layer_outputs = layer_module([hidden_states, attention_mask, head_mask[i]], training=training)
+            hidden_states = layer_outputs[0]
+            if self.output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+        # Add last layer
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+        outputs = (hidden_states,)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            outputs = outputs + (all_attentions,)
+        return outputs  # outputs, (hidden states), (attentions)
+class TFBertPooler(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFBertPooler, self).__init__(**kwargs)
+        self.dense = tf.keras.layers.Dense(config.hidden_size,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           activation='tanh',
+                                           name='dense')
+    def call(self, hidden_states):
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        return pooled_output
+class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFBertPredictionHeadTransform, self).__init__(**kwargs)
+        self.dense = tf.keras.layers.Dense(config.hidden_size,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name='dense')
+        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
+            self.transform_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.transform_act_fn = config.hidden_act
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
+    def call(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+class TFBertLMPredictionHead(tf.keras.layers.Layer):
+    def __init__(self, config, input_embeddings, **kwargs):
+        super(TFBertLMPredictionHead, self).__init__(**kwargs)
+        self.vocab_size = config.vocab_size
+        self.transform = TFBertPredictionHeadTransform(config, name='transform')
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.input_embeddings = input_embeddings
+    def build(self, input_shape):
+        self.bias = self.add_weight(shape=(self.vocab_size,),
+                                    initializer='zeros',
+                                    trainable=True,
+                                    name='bias')
+        super(TFBertLMPredictionHead, self).build(input_shape)
+    def call(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.input_embeddings(hidden_states, mode="linear")
+        hidden_states = hidden_states + self.bias
+        return hidden_states
+class TFBertMLMHead(tf.keras.layers.Layer):
+    def __init__(self, config, input_embeddings, **kwargs):
+        super(TFBertMLMHead, self).__init__(**kwargs)
+        self.predictions = TFBertLMPredictionHead(config, input_embeddings, name='predictions')
+    def call(self, sequence_output):
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
+class TFBertNSPHead(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFBertNSPHead, self).__init__(**kwargs)
+        self.seq_relationship = tf.keras.layers.Dense(2,
+                                                      kernel_initializer=get_initializer(config.initializer_range),
+                                                      name='seq_relationship')
+    def call(self, pooled_output):
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return seq_relationship_score
+class TFBertMainLayer(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFBertMainLayer, self).__init__(**kwargs)
+        self.num_hidden_layers = config.num_hidden_layers
+        self.embeddings = TFBertEmbeddings(config, name='embeddings')
+        self.encoder = TFBertEncoder(config, name='encoder')
+        self.pooler = TFBertPooler(config, name='pooler')
+    def _resize_token_embeddings(self, new_num_tokens):
+        raise NotImplementedError
+    def _prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+            See base class PreTrainedModel
+        """
+        raise NotImplementedError
+    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
+        if isinstance(inputs, (tuple, list)):
+            input_ids = inputs[0]
+            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
+            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
+            position_ids = inputs[3] if len(inputs) > 3 else position_ids
+            head_mask = inputs[4] if len(inputs) > 4 else head_mask
+            assert len(inputs) <= 5, "Too many inputs."
+        elif isinstance(inputs, dict):
+            input_ids = inputs.get('input_ids')
+            attention_mask = inputs.get('attention_mask', attention_mask)
+            token_type_ids = inputs.get('token_type_ids', token_type_ids)
+            position_ids = inputs.get('position_ids', position_ids)
+            head_mask = inputs.get('head_mask', head_mask)
+            assert len(inputs) <= 5, "Too many inputs."
+        else:
+            input_ids = inputs
+        if attention_mask is None:
+            attention_mask = tf.fill(tf.shape(input_ids), 1)
+        if token_type_ids is None:
+            token_type_ids = tf.fill(tf.shape(input_ids), 0)
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if not head_mask is None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.num_hidden_layers
+            # head_mask = tf.constant([0] * self.num_hidden_layers)
+        embedding_output = self.embeddings([input_ids, position_ids, token_type_ids], training=training)
+        encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output)
+        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
+class TFBertPreTrainedModel(TFPreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for dowloading and loading pretrained models.
+    """
+    config_class = BertConfig
+    pretrained_model_archive_map = TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_pt_weights = load_bert_pt_weights_in_tf2
+    base_model_prefix = "bert"
+BERT_START_DOCSTRING = r"""    The BERT model was proposed in
+    `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_
+    by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
+    pre-trained using a combination of masked language modeling objective and next sentence prediction
+    on a large corpus comprising the Toronto Book Corpus and Wikipedia.
+    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
+    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
+    .. _`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`:
+        https://arxiv.org/abs/1810.04805
+    .. _`tf.keras.Model`:
+        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
+    Note on the model inputs:
+        TF 2.0 models accepts two formats as inputs:
+            - having all inputs as keyword arguments (like PyTorch models), or
+            - having all inputs as a list, tuple or dict in the first positional arguments.
+        This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
+        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
+        - a single Tensor with input_ids only and nothing else: `model(inputs_ids)
+        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+        - a dictionary with one or several input Tensors associaed to the input names given in the docstring:
+            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
+    Parameters:
+        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+BERT_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+            (a) For sequence pairs:
+                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
+                ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
+            (b) For single sequences:
+                ``tokens:         [CLS] the dog is hairy . [SEP]``
+                ``token_type_ids:   0   0   0   0  0     0   0``
+            Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+            Indices can be obtained using :class:`transformers.BertTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Segment token indices to indicate first and second portions of the inputs.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
+        **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
+                      BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class TFBertModel(TFBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)``
+            Last layer hidden-state of the first token of the sequence (classification token)
+            further processed by a Linear layer and a Tanh activation function. The Linear
+            layer weights are trained from the next sentence prediction (classification)
+            objective during Bert pretraining. This output is usually *not* a good summary
+            of the semantic content of the input, you're often better with averaging or pooling
+            the sequence of hidden-states for the whole input sequence.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        import tensorflow as tf
+        from transformers import BertTokenizer, TFBertModel
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = TFBertModel.from_pretrained('bert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertModel, self).__init__(config, *inputs, **kwargs)
+        self.bert = TFBertMainLayer(config, name='bert')
+    def call(self, inputs, **kwargs):
+        outputs = self.bert(inputs, **kwargs)
+        return outputs
+@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training:
+    a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class TFBertForPreTraining(TFBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **prediction_scores**: ```tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **seq_relationship_scores**: ```tf.Tensor`` of shape ``(batch_size, sequence_length, 2)``
+            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ```tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ```tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        import tensorflow as tf
+        from transformers import BertTokenizer, TFBertForPreTraining
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = TFBertForPreTraining.from_pretrained('bert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        prediction_scores, seq_relationship_scores = outputs[:2]
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertForPreTraining, self).__init__(config, *inputs, **kwargs)
+        self.bert = TFBertMainLayer(config, name='bert')
+        self.nsp = TFBertNSPHead(config, name='nsp___cls')
+        self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls')
+    def call(self, inputs, **kwargs):
+        outputs = self.bert(inputs, **kwargs)
+        sequence_output, pooled_output = outputs[:2]
+        prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False))
+        seq_relationship_score = self.nsp(pooled_output)
+        outputs = (prediction_scores, seq_relationship_score,) + outputs[2:]  # add hidden states and attention if they are here
+        return outputs  # prediction_scores, seq_relationship_score, (hidden_states), (attentions)
+@add_start_docstrings("""Bert Model with a `language modeling` head on top. """,
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class TFBertForMaskedLM(TFBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        import tensorflow as tf
+        from transformers import BertTokenizer, TFBertForMaskedLM
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        prediction_scores = outputs[0]
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        self.bert = TFBertMainLayer(config, name='bert')
+        self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls')
+    def call(self, inputs, **kwargs):
+        outputs = self.bert(inputs, **kwargs)
+        sequence_output = outputs[0]
+        prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False))
+        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
+        return outputs  # prediction_scores, (hidden_states), (attentions)
+@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **seq_relationship_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, 2)``
+            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        import tensorflow as tf
+        from transformers import BertTokenizer, TFBertForNextSentencePrediction
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        seq_relationship_scores = outputs[0]
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertForNextSentencePrediction, self).__init__(config, *inputs, **kwargs)
+        self.bert = TFBertMainLayer(config, name='bert')
+        self.nsp = TFBertNSPHead(config, name='nsp___cls')
+    def call(self, inputs, **kwargs):
+        outputs = self.bert(inputs, **kwargs)
+        pooled_output = outputs[1]
+        seq_relationship_score = self.nsp(pooled_output)
+        outputs = (seq_relationship_score,) + outputs[2:]  # add hidden states and attention if they are here
+        return outputs  # seq_relationship_score, (hidden_states), (attentions)
+@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class TFBertForSequenceClassification(TFBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **logits**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        import tensorflow as tf
+        from transformers import BertTokenizer, TFBertForSequenceClassification
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        logits = outputs[0]
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+        self.bert = TFBertMainLayer(config, name='bert')
+        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='classifier')
+    def call(self, inputs, **kwargs):
+        outputs = self.bert(inputs, **kwargs)
+        pooled_output = outputs[1]
+        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
+        logits = self.classifier(pooled_output)
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+        return outputs  # logits, (hidden_states), (attentions)
+@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
+    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class TFBertForMultipleChoice(TFBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **classification_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above).
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        import tensorflow as tf
+        from transformers import BertTokenizer, TFBertForMultipleChoice
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased')
+        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
+        input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :]  # Batch size 1, 2 choices
+        outputs = model(input_ids)
+        classification_scores = outputs[0]
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertForMultipleChoice, self).__init__(config, *inputs, **kwargs)
+        self.bert = TFBertMainLayer(config, name='bert')
+        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = tf.keras.layers.Dense(1,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='classifier')
+    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
+        if isinstance(inputs, (tuple, list)):
+            input_ids = inputs[0]
+            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
+            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
+            position_ids = inputs[3] if len(inputs) > 3 else position_ids
+            head_mask = inputs[4] if len(inputs) > 4 else head_mask
+            assert len(inputs) <= 5, "Too many inputs."
+        elif isinstance(inputs, dict):
+            input_ids = inputs.get('input_ids')
+            attention_mask = inputs.get('attention_mask', attention_mask)
+            token_type_ids = inputs.get('token_type_ids', token_type_ids)
+            position_ids = inputs.get('position_ids', position_ids)
+            head_mask = inputs.get('head_mask', head_mask)
+            assert len(inputs) <= 5, "Too many inputs."
+        else:
+            input_ids = inputs
+        num_choices = tf.shape(input_ids)[1]
+        seq_length = tf.shape(input_ids)[2]
+        flat_input_ids = tf.reshape(input_ids, (-1, seq_length))
+        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
+        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
+        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
+        flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask]
+        outputs = self.bert(flat_inputs, training=training)
+        pooled_output = outputs[1]
+        pooled_output = self.dropout(pooled_output, training=training)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = tf.reshape(logits, (-1, num_choices))
+        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here
+        return outputs  # reshaped_logits, (hidden_states), (attentions)
+@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class TFBertForTokenClassification(TFBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        import tensorflow as tf
+        from transformers import BertTokenizer, TFBertForTokenClassification
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        scores = outputs[0]
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+        self.bert = TFBertMainLayer(config, name='bert')
+        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='classifier')
+    def call(self, inputs, **kwargs):
+        outputs = self.bert(inputs, **kwargs)
+        sequence_output = outputs[0]
+        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
+        logits = self.classifier(sequence_output)
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+        return outputs  # scores, (hidden_states), (attentions)
+@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class TFBertForQuestionAnswering(TFBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **start_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        import tensorflow as tf
+        from transformers import BertTokenizer, TFBertForQuestionAnswering
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = TFBertForQuestionAnswering.from_pretrained('bert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        start_scores, end_scores = outputs[:2]
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+        self.bert = TFBertMainLayer(config, name='bert')
+        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='qa_outputs')
+    def call(self, inputs, **kwargs):
+        outputs = self.bert(inputs, **kwargs)
+        sequence_output = outputs[0]
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = tf.split(logits, 2, axis=-1)
+        start_logits = tf.squeeze(start_logits, axis=-1)
+        end_logits = tf.squeeze(end_logits, axis=-1)
+        outputs = (start_logits, end_logits,) + outputs[2:]
+        return outputs  # start_logits, end_logits, (hidden_states), (attentions)
--- a/transformers/modeling_tf_distilbert.py
+++ b/transformers/modeling_tf_distilbert.py
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TF 2.0 DistilBERT model
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+import json
+import logging
+import math
+import copy
+import sys
+from io import open
+import itertools
+import numpy as np
+import tensorflow as tf
+from .configuration_distilbert import DistilBertConfig
+from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer
+from .file_utils import add_start_docstrings
+from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
+logger = logging.getLogger(__name__)
+TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
+    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5"
+}
+### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
+def gelu(x):
+    """ Gaussian Error Linear Unit.
+    Original Implementation of the gelu activation function in Google Bert repo when initially created.
+        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
+        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+        Also see https://arxiv.org/abs/1606.08415
+    """
+    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
+    return x * cdf
+def gelu_new(x):
+    """Gaussian Error Linear Unit.
+    This is a smoother version of the RELU.
+    Original paper: https://arxiv.org/abs/1606.08415
+    Args:
+        x: float Tensor to perform activation.
+    Returns:
+        `x` with the GELU activation applied.
+    """
+    cdf = 0.5 * (1.0 + tf.tanh(
+        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
+    return x * cdf
+def load_distilbert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
+    # build the network
+    inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
+    attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+    tf_inputs = [inputs_list, attns_list]
+    tfo = tf_model(tf_inputs, training=False)
+    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
+class TFEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFEmbeddings, self).__init__(**kwargs)
+        self.vocab_size = config.vocab_size
+        self.dim = config.dim
+        self.initializer_range = config.initializer_range
+        self.word_embeddings = TFSharedEmbeddings(config.vocab_size,
+                                                  config.dim,
+                                                  initializer_range=config.initializer_range,
+                                                  name='word_embeddings')  # padding_idx=0)
+        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
+                                                             config.dim,
+                                                             embeddings_initializer=get_initializer(config.initializer_range),
+                                                             name='position_embeddings')
+        if config.sinusoidal_pos_embds:
+            raise NotImplementedError
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
+        self.dropout = tf.keras.layers.Dropout(config.dropout)
+    def build(self, input_shape):
+        """Build shared word embedding layer """
+        with tf.name_scope("word_embeddings"):
+            # Create and initialize weights. The random normal initializer was chosen
+            # arbitrarily, and works well.
+            self.word_embeddings = self.add_weight(
+                "weight",
+                shape=[self.vocab_size, self.dim],
+                initializer=get_initializer(self.initializer_range))
+        super(TFEmbeddings, self).build(input_shape)
+    def call(self, inputs, mode="embedding", training=False):
+        """Get token embeddings of inputs.
+        Args:
+            inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
+            mode: string, a valid value is one of "embedding" and "linear".
+        Returns:
+            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
+                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
+                linear tensor, float32 with shape [batch_size, length, vocab_size].
+        Raises:
+            ValueError: if mode is not valid.
+        Shared weights logic adapted from
+            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
+        """
+        if mode == "embedding":
+            return self._embedding(inputs, training=training)
+        elif mode == "linear":
+            return self._linear(inputs)
+        else:
+            raise ValueError("mode {} is not valid.".format(mode))
+    def _embedding(self, inputs, training=False):
+        """
+        Parameters
+        ----------
+        input_ids: tf.Tensor(bs, max_seq_length)
+            The token ids to embed.
+        Outputs
+        -------
+        embeddings: tf.Tensor(bs, max_seq_length, dim)
+            The embedded tokens (plus position embeddings, no token_type embeddings)
+        """
+        if not isinstance(inputs, (tuple, list)):
+            input_ids = inputs
+            position_ids = None
+        else:
+            input_ids, position_ids = inputs
+        seq_length = tf.shape(input_ids)[1]
+        if position_ids is None:
+            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
+        word_embeddings = tf.gather(self.word_embeddings, input_ids)
+        position_embeddings = self.position_embeddings(position_ids)  # (bs, max_seq_length, dim)
+        embeddings = word_embeddings + position_embeddings            # (bs, max_seq_length, dim)
+        embeddings = self.LayerNorm(embeddings)                       # (bs, max_seq_length, dim)
+        embeddings = self.dropout(embeddings, training=training)      # (bs, max_seq_length, dim)
+        return embeddings
+    def _linear(self, inputs):
+        """Computes logits by running inputs through a linear layer.
+            Args:
+                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
+            Returns:
+                float32 tensor with shape [batch_size, length, vocab_size].
+        """
+        batch_size = tf.shape(inputs)[0]
+        length = tf.shape(inputs)[1]
+        x = tf.reshape(inputs, [-1, self.dim])
+        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
+        return tf.reshape(logits, [batch_size, length, self.vocab_size])
+class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFMultiHeadSelfAttention, self).__init__(**kwargs)
+        self.n_heads = config.n_heads
+        self.dim = config.dim
+        self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
+        self.output_attentions = config.output_attentions
+        assert self.dim % self.n_heads == 0
+        self.q_lin = tf.keras.layers.Dense(config.dim,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name="q_lin")
+        self.k_lin = tf.keras.layers.Dense(config.dim,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name="k_lin")
+        self.v_lin = tf.keras.layers.Dense(config.dim,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name="v_lin")
+        self.out_lin = tf.keras.layers.Dense(config.dim,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name="out_lin")
+        self.pruned_heads = set()
+    def prune_heads(self, heads):
+        raise NotImplementedError
+    def call(self, inputs, training=False):
+        """
+        Parameters
+        ----------
+        query: tf.Tensor(bs, seq_length, dim)
+        key: tf.Tensor(bs, seq_length, dim)
+        value: tf.Tensor(bs, seq_length, dim)
+        mask: tf.Tensor(bs, seq_length)
+        Outputs
+        -------
+        weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
+            Attention weights
+        context: tf.Tensor(bs, seq_length, dim)
+            Contextualized layer. Optional: only if `output_attentions=True`
+        """
+        query, key, value, mask, head_mask = inputs
+        bs, q_length, dim = shape_list(query)
+        k_length = shape_list(key)[1]
+        # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
+        # assert key.size() == value.size()
+        dim_per_head = self.dim // self.n_heads
+        assert 2 <= len(tf.shape(mask)) <= 3
+        causal = (len(tf.shape(mask)) == 3)
+        mask_reshape = [bs, 1, 1, k_length]
+        def shape(x):
+            """ separate heads """
+            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
+        def unshape(x):
+            """ group heads """
+            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
+        q = shape(self.q_lin(query))           # (bs, n_heads, q_length, dim_per_head)
+        k = shape(self.k_lin(key))             # (bs, n_heads, k_length, dim_per_head)
+        v = shape(self.v_lin(value))           # (bs, n_heads, k_length, dim_per_head)
+        q = q / math.sqrt(dim_per_head)                     # (bs, n_heads, q_length, dim_per_head)
+        scores = tf.matmul(q, k, transpose_b=True)          # (bs, n_heads, q_length, k_length)
+        mask = tf.reshape(mask, mask_reshape)                           # (bs, n_heads, qlen, klen)
+        # scores.masked_fill_(mask, -float('inf'))            # (bs, n_heads, q_length, k_length)
+        scores = scores - 1e30 * (1.0 - mask)
+        weights = tf.nn.softmax(scores, axis=-1)                              # (bs, n_heads, qlen, klen)
+        weights = self.dropout(weights, training=training)                    # (bs, n_heads, qlen, klen)
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+        context = tf.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)             # (bs, q_length, dim)
+        context = self.out_lin(context)        # (bs, q_length, dim)
+        if self.output_attentions:
+            return (context, weights)
+        else:
+            return (context,)
+class TFFFN(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFFFN, self).__init__(**kwargs)
+        self.dropout = tf.keras.layers.Dropout(config.dropout)
+        self.lin1 = tf.keras.layers.Dense(config.hidden_dim,
+                                          kernel_initializer=get_initializer(config.initializer_range),
+                                          name="lin1")
+        self.lin2 = tf.keras.layers.Dense(config.dim,
+                                          kernel_initializer=get_initializer(config.initializer_range),
+                                          name="lin2")
+        assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
+        self.activation = tf.keras.layers.Activation(gelu) if config.activation=='gelu' else tf.keras.activations.relu
+    def call(self, input, training=False):
+        x = self.lin1(input)
+        x = self.activation(x)
+        x = self.lin2(x)
+        x = self.dropout(x, training=training)
+        return x
+class TFTransformerBlock(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFTransformerBlock, self).__init__(**kwargs)
+        self.n_heads = config.n_heads
+        self.dim = config.dim
+        self.hidden_dim = config.hidden_dim
+        self.dropout = tf.keras.layers.Dropout(config.dropout)
+        self.activation = config.activation
+        self.output_attentions = config.output_attentions
+        assert config.dim % config.n_heads == 0
+        self.attention = TFMultiHeadSelfAttention(config, name="attention")
+        self.sa_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm")
+        self.ffn = TFFFN(config, name="ffn")
+        self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm")
+    def call(self, inputs, training=False):  # removed: src_enc=None, src_len=None
+        """
+        Parameters
+        ----------
+        x: tf.Tensor(bs, seq_length, dim)
+        attn_mask: tf.Tensor(bs, seq_length)
+        Outputs
+        -------
+        sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
+            The attention weights
+        ffn_output: tf.Tensor(bs, seq_length, dim)
+            The output of the transformer block contextualization.
+        """
+        x, attn_mask, head_mask = inputs
+        # Self-Attention
+        sa_output = self.attention([x, x, x, attn_mask, head_mask], training=training)
+        if self.output_attentions:
+            sa_output, sa_weights = sa_output                  # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
+        else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples
+            # assert type(sa_output) == tuple
+            sa_output = sa_output[0]
+        sa_output = self.sa_layer_norm(sa_output + x)          # (bs, seq_length, dim)
+        # Feed Forward Network
+        ffn_output = self.ffn(sa_output, training=training)                             # (bs, seq_length, dim)
+        ffn_output = self.output_layer_norm(ffn_output + sa_output)  # (bs, seq_length, dim)
+        output = (ffn_output,)
+        if self.output_attentions:
+            output = (sa_weights,) + output
+        return output
+class TFTransformer(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFTransformer, self).__init__(**kwargs)
+        self.n_layers = config.n_layers
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.layer = [TFTransformerBlock(config, name='layer_._{}'.format(i))
+                      for i in range(config.n_layers)]
+    def call(self, inputs, training=False):
+        """
+        Parameters
+        ----------
+        x: tf.Tensor(bs, seq_length, dim)
+            Input sequence embedded.
+        attn_mask: tf.Tensor(bs, seq_length)
+            Attention mask on the sequence.
+        Outputs
+        -------
+        hidden_state: tf.Tensor(bs, seq_length, dim)
+            Sequence of hiddens states in the last (top) layer
+        all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)]
+            Tuple of length n_layers with the hidden states from each layer.
+            Optional: only if output_hidden_states=True
+        all_attentions: Tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)]
+            Tuple of length n_layers with the attention weights from each layer
+            Optional: only if output_attentions=True
+        """
+        x, attn_mask, head_mask = inputs
+        all_hidden_states = ()
+        all_attentions = ()
+        hidden_state = x
+        for i, layer_module in enumerate(self.layer):
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_state,)
+            layer_outputs = layer_module([hidden_state, attn_mask, head_mask[i]], training=training)
+            hidden_state = layer_outputs[-1]
+            if self.output_attentions:
+                assert len(layer_outputs) == 2
+                attentions = layer_outputs[0]
+                all_attentions = all_attentions + (attentions,)
+            else:
+                assert len(layer_outputs) == 1
+        # Add last layer
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_state,)
+        outputs = (hidden_state,)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            outputs = outputs + (all_attentions,)
+        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
+class TFDistilBertMainLayer(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFDistilBertMainLayer, self).__init__(**kwargs)
+        self.num_hidden_layers = config.num_hidden_layers
+        self.embeddings = TFEmbeddings(config, name="embeddings")   # Embeddings
+        self.transformer = TFTransformer(config, name="transformer") # Encoder
+    def _resize_token_embeddings(self, new_num_tokens):
+        raise NotImplementedError
+    def _prune_heads(self, heads_to_prune):
+        raise NotImplementedError
+    def call(self, inputs, attention_mask=None, head_mask=None, training=False):
+        if isinstance(inputs, (tuple, list)):
+            input_ids = inputs[0]
+            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
+            head_mask = inputs[2] if len(inputs) > 2 else head_mask
+            assert len(inputs) <= 3, "Too many inputs."
+        elif isinstance(inputs, dict):
+            input_ids = inputs.get('input_ids')
+            attention_mask = inputs.get('attention_mask', attention_mask)
+            head_mask = inputs.get('head_mask', head_mask)
+            assert len(inputs) <= 3, "Too many inputs."
+        else:
+            input_ids = inputs
+        if attention_mask is None:
+            attention_mask = tf.ones(shape_list(input_ids)) # (bs, seq_length)
+        attention_mask = tf.cast(attention_mask, dtype=tf.float32)
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.num_hidden_layers
+        embedding_output = self.embeddings(input_ids)   # (bs, seq_length, dim)
+        tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training)
+        return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions)
+### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ###
+class TFDistilBertPreTrainedModel(TFPreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = DistilBertConfig
+    pretrained_model_archive_map = TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_pt_weights = load_distilbert_pt_weights_in_tf2
+    base_model_prefix = "distilbert"
+DISTILBERT_START_DOCSTRING = r"""
+    DistilBERT is a small, fast, cheap and light Transformer model
+    trained by distilling Bert base. It has 40% less parameters than
+    `bert-base-uncased`, runs 60% faster while preserving over 95% of
+    Bert's performances as measured on the GLUE language understanding benchmark.
+    Here are the differences between the interface of Bert and DistilBert:
+    - DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`)
+    - DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let's us know if you need this option.
+    For more information on DistilBERT, please refer to our
+    `detailed blog post`_
+    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
+    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
+    .. _`detailed blog post`:
+        https://medium.com/huggingface/distilbert-8cf3380435b5
+    .. _`tf.keras.Model`:
+        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
+    Note on the model inputs:
+        TF 2.0 models accepts two formats as inputs:
+            - having all inputs as keyword arguments (like PyTorch models), or
+            - having all inputs as a list, tuple or dict in the first positional arguments.
+        This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
+        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
+        - a single Tensor with input_ids only and nothing else: `model(inputs_ids)
+        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+        - a dictionary with one or several input Tensors associaed to the input names given in the docstring:
+            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
+    Parameters:
+        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. 
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+DISTILBERT_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids** ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            The input sequences should start with `[CLS]` and end with `[SEP]` tokens.
+            For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DistilBERT.
+        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
+                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+class TFDistilBertModel(TFDistilBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        import tensorflow as tf
+        from transformers import DistilBertTokenizer, TFDistilBertModel
+        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+        model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs)
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")   # Embeddings
+    def call(self, inputs, **kwargs):
+        outputs = self.distilbert(inputs, **kwargs)
+        return outputs
+class TFDistilBertLMHead(tf.keras.layers.Layer):
+    def __init__(self, config, input_embeddings, **kwargs):
+        super(TFDistilBertLMHead, self).__init__(**kwargs)
+        self.vocab_size = config.vocab_size
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.input_embeddings = input_embeddings
+    def build(self, input_shape):
+        self.bias = self.add_weight(shape=(self.vocab_size,),
+                                    initializer='zeros',
+                                    trainable=True,
+                                    name='bias')
+        super(TFDistilBertLMHead, self).build(input_shape)
+    def call(self, hidden_states):
+        hidden_states = self.input_embeddings(hidden_states, mode="linear")
+        hidden_states = hidden_states + self.bias
+        return hidden_states
+@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """,
+                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        import tensorflow as tf
+        from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
+        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+        model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        prediction_scores = outputs[0]
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.vocab_size = config.vocab_size
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
+        self.vocab_transform = tf.keras.layers.Dense(config.dim,
+                                                     kernel_initializer=get_initializer(config.initializer_range),
+                                                     name="vocab_transform")
+        self.act = tf.keras.layers.Activation(gelu)
+        self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
+        self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
+    def call(self, inputs, **kwargs):
+        distilbert_output = self.distilbert(inputs, **kwargs)
+        hidden_states = distilbert_output[0]                               # (bs, seq_length, dim)
+        prediction_logits = self.vocab_transform(hidden_states)       # (bs, seq_length, dim)
+        prediction_logits = self.act(prediction_logits)               # (bs, seq_length, dim)
+        prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
+        prediction_logits = self.vocab_projector(prediction_logits)
+        outputs = (prediction_logits,) + distilbert_output[1:]
+        return outputs  # logits, (hidden_states), (attentions)
+@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
+                         the pooled output) e.g. for GLUE tasks. """,
+                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        import tensorflow as tf
+        from transformers import BertTokenizer, TFDistilBertForSequenceClassification
+        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+        model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        logits = outputs[0]
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
+        self.pre_classifier = tf.keras.layers.Dense(config.dim,
+                                                    kernel_initializer=get_initializer(config.initializer_range),
+                                                    activation='relu',
+                                                    name="pre_classifier")
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name="classifier")
+        self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
+    def call(self, inputs, **kwargs):
+        distilbert_output = self.distilbert(inputs, **kwargs)
+        hidden_state = distilbert_output[0]                    # (bs, seq_len, dim)
+        pooled_output = hidden_state[:, 0]                    # (bs, dim)
+        pooled_output = self.pre_classifier(pooled_output)   # (bs, dim)
+        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))         # (bs, dim)
+        logits = self.classifier(pooled_output)              # (bs, dim)
+        outputs = (logits,) + distilbert_output[1:]
+        return outputs  # logits, (hidden_states), (attentions)
+@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+                         the hidden-states output to compute `span start logits` and `span end logits`). """,
+                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **start_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        import tensorflow as tf
+        from transformers import BertTokenizer, TFDistilBertForQuestionAnswering
+        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+        model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        start_scores, end_scores = outputs[:2]
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
+        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='qa_outputs')
+        assert config.num_labels == 2
+        self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
+    def call(self, inputs, **kwargs):
+        distilbert_output = self.distilbert(inputs, **kwargs)
+        hidden_states = distilbert_output[0]                                 # (bs, max_query_len, dim)
+        hidden_states = self.dropout(hidden_states, training=kwargs.get('training', False))                       # (bs, max_query_len, dim)
+        logits = self.qa_outputs(hidden_states)                           # (bs, max_query_len, 2)
+        start_logits, end_logits = tf.split(logits, 2, axis=-1)
+        start_logits = tf.squeeze(start_logits, axis=-1)
+        end_logits = tf.squeeze(end_logits, axis=-1)
+        outputs = (start_logits, end_logits,) + distilbert_output[1:]
+        return outputs  # start_logits, end_logits, (hidden_states), (attentions)