Merge branch 'master' into fix-xlnet-squad2.0

562f8640 · Thomas Wolf · GitHub · ca99a2d5 · 8618bf15 · 562f8640
Unverified Commit 562f8640 authored Dec 21, 2019 by Thomas Wolf Committed by GitHub Dec 21, 2019
20 changed files
--- a/transformers/configuration_xlm_roberta.py
+++ b/transformers/configuration_xlm_roberta.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" XLM-RoBERTa configuration """
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import logging
+
+from .configuration_roberta import RobertaConfig
+
+logger = logging.getLogger(__name__)
+
+XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json",
+    'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json",
+    'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json",
+    'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json",
+    'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json",
+    'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json",
+}
+
+
+class XLMRobertaConfig(RobertaConfig):
+    pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
--- a/transformers/configuration_xlnet.py
+++ b/transformers/configuration_xlnet.py
@@ -35,7 +35,7 @@ class XLNetConfig(PretrainedConfig):
    """Configuration class to store the configuration of a ``XLNetModel``.

    Args:
-        vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
+        vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
        d_model: Size of the encoder layers and the pooler layer.
        n_layer: Number of hidden layers in the Transformer encoder.
        n_head: Number of attention heads for each attention layer in
@@ -72,28 +72,22 @@ class XLNetConfig(PretrainedConfig):
    pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
-                 vocab_size_or_config_json_file=32000,
+                 vocab_size=32000,
                 d_model=1024,
                 n_layer=24,
                 n_head=16,
                 d_inner=4096,
-                 max_position_embeddings=512,
                 ff_activation="gelu",
                 untie_r=True,
                 attn_type="bi",
-
                 initializer_range=0.02,
                 layer_norm_eps=1e-12,
-
                 dropout=0.1,
                 mem_len=None,
                 reuse_len=None,
                 bi_data=False,
                 clamp_len=-1,
                 same_length=False,
-
-                 finetuning_task=None,
-                 num_labels=2,
                 summary_type='last',
                 summary_use_proj=True,
                 summary_activation='tanh',
@@ -104,58 +98,45 @@ class XLNetConfig(PretrainedConfig):
        """Constructs XLNetConfig.
        """
        super(XLNetConfig, self).__init__(**kwargs)
-
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                setattr(config, key, value)
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.n_token = vocab_size_or_config_json_file
-            self.d_model = d_model
-            self.n_layer = n_layer
-            self.n_head = n_head
-            assert d_model % n_head == 0
-            self.d_head = d_model // n_head
-            self.ff_activation = ff_activation
-            self.d_inner = d_inner
-            self.untie_r = untie_r
-            self.attn_type = attn_type
-
-            self.initializer_range = initializer_range
-            self.layer_norm_eps = layer_norm_eps
-
-            self.dropout = dropout
-            self.mem_len = mem_len
-            self.reuse_len = reuse_len
-            self.bi_data = bi_data
-            self.clamp_len = clamp_len
-            self.same_length = same_length
-
-            self.finetuning_task = finetuning_task
-            self.num_labels = num_labels
-            self.summary_type = summary_type
-            self.summary_use_proj = summary_use_proj
-            self.summary_activation = summary_activation
-            self.summary_last_dropout = summary_last_dropout
-            self.start_n_top = start_n_top
-            self.end_n_top = end_n_top
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.n_layer = n_layer
+        self.n_head = n_head
+        assert d_model % n_head == 0
+        self.d_head = d_model // n_head
+        self.ff_activation = ff_activation
+        self.d_inner = d_inner
+        self.untie_r = untie_r
+        self.attn_type = attn_type
+
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+
+        self.dropout = dropout
+        self.mem_len = mem_len
+        self.reuse_len = reuse_len
+        self.bi_data = bi_data
+        self.clamp_len = clamp_len
+        self.same_length = same_length
+
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_last_dropout = summary_last_dropout
+        self.start_n_top = start_n_top
+        self.end_n_top = end_n_top

    @property
    def max_position_embeddings(self):
        return -1

    @property
-    def vocab_size(self):
-        return self.n_token
+    def n_token(self):  # Backward compatibility
+        return self.vocab_size

-    @vocab_size.setter
-    def vocab_size(self, value):
-        self.n_token = value
+    @n_token.setter
+    def n_token(self, value):  # Backward compatibility
+        self.vocab_size = value

    @property
    def hidden_size(self):

--- a/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert ALBERT checkpoint."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import torch
+
+from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert
+
+import logging
+logging.basicConfig(level=logging.INFO)
+
+
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
+    # Initialise PyTorch model
+    config = AlbertConfig.from_json_file(albert_config_file)
+    print("Building PyTorch model from configuration: {}".format(str(config)))
+    model = AlbertForMaskedLM(config)
+
+    # Load weights from tf checkpoint
+    load_tf_weights_in_albert(model, config, tf_checkpoint_path)
+
+    # Save pytorch-model
+    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    torch.save(model.state_dict(), pytorch_dump_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ## Required parameters
+    parser.add_argument("--tf_checkpoint_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the TensorFlow checkpoint path.")
+    parser.add_argument("--albert_config_file",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "The config json file corresponding to the pre-trained ALBERT model. \n"
+                            "This specifies the model architecture.")
+    parser.add_argument("--pytorch_dump_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the output PyTorch model.")
+    args = parser.parse_args()
+    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
+                                     args.albert_config_file,
+                                     args.pytorch_dump_path)
+ 
\ No newline at end of file
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -32,8 +32,10 @@ from transformers import (load_pytorch_checkpoint_in_tf2_model,
                                  TransfoXLConfig, TFTransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP)
+                                  DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, TFDistilBertForSequenceClassification, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP)

 if is_torch_available():
    import torch
@@ -45,8 +47,10 @@ if is_torch_available():
                                      TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
+                                      DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                      CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                      AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                      T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
    (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
@@ -55,8 +59,10 @@ else:
    TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
    OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
    RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-    DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) = (
+    DistilBertForMaskedLM, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+    AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = (
        None, None, None, None,
        None, None,
        None, None,
@@ -64,7 +70,9 @@ else:
        None, None,
        None, None,
        None, None, None,
-        None, None, None,
+        None, None, None, None,
+        None, None,
+        None, None,
        None, None)


@@ -85,7 +93,10 @@ MODEL_CLASSES = {
    'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP)
+    'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP),
 }

 def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
@@ -110,23 +121,21 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
    tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path)

    if compare_with_pt_model:
-        inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-        tf_inputs = tf.constant(inputs_list)
-        tfo = tf_model(tf_inputs, training=False)  # build the network
+        tfo = tf_model(tf_model.dummy_inputs, training=False)  # build the network

-        pt_model = pt_model_class.from_pretrained(None,
+        state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu')
+        pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None,
                                                  config=config,
-                                                  state_dict=torch.load(pytorch_checkpoint_path,
-                                                                        map_location='cpu'))
-        pt_inputs = torch.tensor(inputs_list)
+                                                  state_dict=state_dict)
+
        with torch.no_grad():
-            pto = pt_model(pt_inputs)
+            pto = pt_model(**pt_model.dummy_inputs)

-        np_pt = pto[0].detach().numpy()
+        np_pt = pto[0].numpy()
        np_tf = tfo[0].numpy()
        diff = np.amax(np.abs(np_pt - np_tf))
        print("Max absolute difference between models outputs {}".format(diff))
-        assert diff <= 2e-2, "Error, model absolute difference is >2e-2"
+        assert diff <= 2e-2, "Error, model absolute difference is >2e-2: {}".format(diff)

    # Save pytorch-model
    print("Save TensorFlow model to {}".format(tf_dump_path))
@@ -134,7 +143,7 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file


 def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
-                                     compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False):
+                                     compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False):
    assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory"

    if args_model_type is None:
@@ -182,13 +191,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc

            if os.path.isfile(model_shortcut_name):
                model_shortcut_name = 'converted_model'
+
            convert_pt_checkpoint_to_tf(model_type=model_type,
                                        pytorch_checkpoint_path=model_file,
                                        config_file=config_file,
                                        tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
                                        compare_with_pt_model=compare_with_pt_model)
-            os.remove(config_file)
-            os.remove(model_file)
+            if remove_cached_files:
+                os.remove(config_file)
+                os.remove(model_file)


 if __name__ == "__main__":
@@ -221,6 +232,9 @@ if __name__ == "__main__":
    parser.add_argument("--use_cached_models",
                        action='store_true',
                        help = "Use cached models if possible instead of updating to latest checkpoint versions.")
+    parser.add_argument("--remove_cached_files",
+                        action='store_true',
+                        help = "Remove pytorch models after conversion (save memory when converting in batches).")
    parser.add_argument("--only_convert_finetuned_models",
                        action='store_true',
                        help = "Only convert finetuned models.")
@@ -240,4 +254,5 @@ if __name__ == "__main__":
                                        config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
                                        compare_with_pt_model=args.compare_with_pt_model,
                                        use_cached_models=args.use_cached_models,
+                                        remove_cached_files=args.remove_cached_files,
                                        only_convert_finetuned_models=args.only_convert_finetuned_models)
--- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -20,6 +20,13 @@ import argparse
 import logging
 import numpy as np
 import torch
+import pathlib
+
+import fairseq
+from packaging import version
+
+if version.parse(fairseq.__version__) < version.parse("0.9.0"):
+    raise Exception("requires fairseq >= 0.9.0")

 from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
 from fairseq.modules import TransformerSentenceEncoderLayer
@@ -45,8 +52,9 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
    """
    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
    roberta.eval()  # disable dropout
+    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
    config = BertConfig(
-        vocab_size_or_config_json_file=50265,
+        vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
        hidden_size=roberta.args.encoder_embed_dim,
        num_hidden_layers=roberta.args.encoder_layers,
        num_attention_heads=roberta.args.encoder_attention_heads,
@@ -64,7 +72,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_

    # Now let's copy all the weights.
    # Embeddings
-    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight)  # just zero them out b/c RoBERTa doesn't use them.
@@ -79,15 +86,18 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
        ### self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert(
-            roberta_layer.self_attn.in_proj_weight.shape == torch.Size((3 * config.hidden_size, config.hidden_size))
+            roberta_layer.self_attn.k_proj.weight.data.shape == \
+            roberta_layer.self_attn.q_proj.weight.data.shape == \
+            roberta_layer.self_attn.v_proj.weight.data.shape == \
+            torch.Size((config.hidden_size, config.hidden_size))
        )
-        # we use three distinct linear layers so we split the source layer here.
-        self_attn.query.weight.data = roberta_layer.self_attn.in_proj_weight[:config.hidden_size, :]
-        self_attn.query.bias.data = roberta_layer.self_attn.in_proj_bias[:config.hidden_size]
-        self_attn.key.weight.data = roberta_layer.self_attn.in_proj_weight[config.hidden_size:2*config.hidden_size, :]
-        self_attn.key.bias.data = roberta_layer.self_attn.in_proj_bias[config.hidden_size:2*config.hidden_size]
-        self_attn.value.weight.data = roberta_layer.self_attn.in_proj_weight[2*config.hidden_size:, :]
-        self_attn.value.bias.data = roberta_layer.self_attn.in_proj_bias[2*config.hidden_size:]
+
+        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
+        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
+        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
+        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
+        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
+        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias

        ### self-attention output
        self_output: BertSelfOutput = layer.attention.output
@@ -151,6 +161,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
    if not success:
        raise Exception("Something went wRoNg")

+    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)


--- a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
+# coding=utf-8
+# Copyright 2018 The T5 authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert T5 checkpoint."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import torch
+
+from transformers import T5Config, T5Model, load_tf_weights_in_t5
+
+import logging
+logging.basicConfig(level=logging.INFO)
+
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
+    # Initialise PyTorch model
+    config = T5Config.from_json_file(config_file)
+    print("Building PyTorch model from configuration: {}".format(str(config)))
+    model = T5Model(config)
+
+    # Load weights from tf checkpoint
+    load_tf_weights_in_t5(model, config, tf_checkpoint_path)
+
+    # Save pytorch-model
+    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    torch.save(model.state_dict(), pytorch_dump_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ## Required parameters
+    parser.add_argument("--tf_checkpoint_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the TensorFlow checkpoint path.")
+    parser.add_argument("--config_file",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "The config json file corresponding to the pre-trained T5 model. \n"
+                            "This specifies the model architecture.")
+    parser.add_argument("--pytorch_dump_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the output PyTorch model.")
+    args = parser.parse_args()
+    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
+                                     args.config_file,
+                                     args.pytorch_dump_path)
--- a/transformers/data/__init__.py
+++ b/transformers/data/__init__.py
-from .processors import InputExample, InputFeatures, DataProcessor
+from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures, SingleSentenceClassificationProcessor
 from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
+from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
+from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels

 from .metrics import is_sklearn_available
 if is_sklearn_available():
-    from .metrics import glue_compute_metrics
+    from .metrics import glue_compute_metrics, xnli_compute_metrics
--- a/transformers/data/metrics/__init__.py
+++ b/transformers/data/metrics/__init__.py
@@ -81,3 +81,11 @@ if _has_sklearn:
            return {"acc": simple_accuracy(preds, labels)}
        else:
            raise KeyError(task_name)
+
+
+    def xnli_compute_metrics(task_name, preds, labels):
+        assert len(preds) == len(labels)
+        if task_name == "xnli":
+            return {"acc": simple_accuracy(preds, labels)}
+        else:
+            raise KeyError(task_name)
--- a/examples/utils_squad.py
+++ b/examples/utils_squad.py
+""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was
+modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
+
+In addition to basic functionality, we also compute additional statistics and
+plot precision-recall curves if an additional na_prob.json file is provided.
+This file is expected to map question ID's to the model's predicted probability
+that a question is unanswerable.
+"""

-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Load SQuAD dataset. """
-
-from __future__ import absolute_import, division, print_function

 import json
 import logging
@@ -24,480 +14,371 @@ import math
 import collections
 from io import open
 from tqdm import tqdm
+import string
+import re

 from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize

-# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
-from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
-
 logger = logging.getLogger(__name__)


-class SquadExample(object):
+def normalize_answer(s):
+    """Lower text and remove punctuation, articles and extra whitespace."""
+    def remove_articles(text):
+        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
+        return re.sub(regex, ' ', text)
+
+    def white_space_fix(text):
+        return ' '.join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return ''.join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def get_tokens(s):
+    if not s:
+        return []
+    return normalize_answer(s).split()
+
+
+def compute_exact(a_gold, a_pred):
+    return int(normalize_answer(a_gold) == normalize_answer(a_pred))
+
+
+def compute_f1(a_gold, a_pred):
+    gold_toks = get_tokens(a_gold)
+    pred_toks = get_tokens(a_pred)
+    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
+    num_same = sum(common.values())
+    if len(gold_toks) == 0 or len(pred_toks) == 0:
+        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
+        return int(gold_toks == pred_toks)
+    if num_same == 0:
+        return 0
+    precision = 1.0 * num_same / len(pred_toks)
+    recall = 1.0 * num_same / len(gold_toks)
+    f1 = (2 * precision * recall) / (precision + recall)
+    return f1
+
+
+def get_raw_scores(examples, preds):
    """
-    A single training/test example for the Squad dataset.
-    For examples without an answer, the start and end position are -1.
+    Computes the exact and f1 scores from the examples and the model predictions
    """
+    exact_scores = {}
+    f1_scores = {}
+
+    for example in examples:
+        qas_id = example.qas_id
+        gold_answers = [answer['text'] for answer in example.answers if normalize_answer(answer['text'])]
+
+        if not gold_answers:
+            # For unanswerable questions, only correct answer is empty string
+            gold_answers = ['']
+
+        if qas_id not in preds:
+            print('Missing prediction for %s' % qas_id)
+            continue
+
+        prediction = preds[qas_id]
+        exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers)
+        f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers)
+
+    return exact_scores, f1_scores

-    def __init__(self,
-                 qas_id,
-                 question_text,
-                 doc_tokens,
-                 orig_answer_text=None,
-                 start_position=None,
-                 end_position=None,
-                 is_impossible=None):
-        self.qas_id = qas_id
-        self.question_text = question_text
-        self.doc_tokens = doc_tokens
-        self.orig_answer_text = orig_answer_text
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-
-    def __str__(self):
-        return self.__repr__()
-
-    def __repr__(self):
-        s = ""
-        s += "qas_id: %s" % (self.qas_id)
-        s += ", question_text: %s" % (
-            self.question_text)
-        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
-        if self.start_position:
-            s += ", start_position: %d" % (self.start_position)
-        if self.end_position:
-            s += ", end_position: %d" % (self.end_position)
-        if self.is_impossible:
-            s += ", is_impossible: %r" % (self.is_impossible)
-        return s
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self,
-                 unique_id,
-                 example_index,
-                 doc_span_index,
-                 tokens,
-                 token_to_orig_map,
-                 token_is_max_context,
-                 input_ids,
-                 input_mask,
-                 segment_ids,
-                 cls_index,
-                 p_mask,
-                 paragraph_len,
-                 start_position=None,
-                 end_position=None,
-                 is_impossible=None):
-        self.unique_id = unique_id
-        self.example_index = example_index
-        self.doc_span_index = doc_span_index
-        self.tokens = tokens
-        self.token_to_orig_map = token_to_orig_map
-        self.token_is_max_context = token_is_max_context
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.cls_index = cls_index
-        self.p_mask = p_mask
-        self.paragraph_len = paragraph_len
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-
-
-def read_squad_examples(input_file, is_training, version_2_with_negative):
-    """Read a SQuAD json file into a list of SquadExample."""
-    with open(input_file, "r", encoding='utf-8') as reader:
-        input_data = json.load(reader)["data"]
-
-    def is_whitespace(c):
-        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
-            return True
-        return False
-
-    examples = []
-    for entry in input_data:
-        for paragraph in entry["paragraphs"]:
-            paragraph_text = paragraph["context"]
-            doc_tokens = []
-            char_to_word_offset = []
-            prev_is_whitespace = True
-            for c in paragraph_text:
-                if is_whitespace(c):
-                    prev_is_whitespace = True
-                else:
-                    if prev_is_whitespace:
-                        doc_tokens.append(c)
-                    else:
-                        doc_tokens[-1] += c
-                    prev_is_whitespace = False
-                char_to_word_offset.append(len(doc_tokens) - 1)
-
-            for qa in paragraph["qas"]:
-                qas_id = qa["id"]
-                question_text = qa["question"]
-                start_position = None
-                end_position = None
-                orig_answer_text = None
-                is_impossible = False
-                if is_training:
-                    if version_2_with_negative:
-                        is_impossible = qa["is_impossible"]
-                    if (len(qa["answers"]) != 1) and (not is_impossible):
-                        raise ValueError(
-                            "For training, each question should have exactly 1 answer.")
-                    if not is_impossible:
-                        answer = qa["answers"][0]
-                        orig_answer_text = answer["text"]
-                        answer_offset = answer["answer_start"]
-                        answer_length = len(orig_answer_text)
-                        start_position = char_to_word_offset[answer_offset]
-                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
-                        # Only add answers where the text can be exactly recovered from the
-                        # document. If this CAN'T happen it's likely due to weird Unicode
-                        # stuff so we will just skip the example.
-                        #
-                        # Note that this means for training mode, every example is NOT
-                        # guaranteed to be preserved.
-                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
-                        cleaned_answer_text = " ".join(
-                            whitespace_tokenize(orig_answer_text))
-                        if actual_text.find(cleaned_answer_text) == -1:
-                            logger.warning("Could not find answer: '%s' vs. '%s'",
-                                           actual_text, cleaned_answer_text)
-                            continue
-                    else:
-                        start_position = -1
-                        end_position = -1
-                        orig_answer_text = ""
-
-                example = SquadExample(
-                    qas_id=qas_id,
-                    question_text=question_text,
-                    doc_tokens=doc_tokens,
-                    orig_answer_text=orig_answer_text,
-                    start_position=start_position,
-                    end_position=end_position,
-                    is_impossible=is_impossible)
-                examples.append(example)
-    return examples
-
-
-def convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                 doc_stride, max_query_length, is_training,
-                                 cls_token_at_end=False,
-                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
-                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
-                                 cls_token_segment_id=0, pad_token_segment_id=0,
-                                 mask_padding_with_zero=True,
-                                 sequence_a_is_doc=False):
-    """Loads a data file into a list of `InputBatch`s."""
-
-    unique_id = 1000000000
-    # cnt_pos, cnt_neg = 0, 0
-    # max_N, max_M = 1024, 1024
-    # f = np.zeros((max_N, max_M), dtype=np.float32)
-
-    features = []
-    for (example_index, example) in enumerate(tqdm(examples)):
-
-        # if example_index % 100 == 0:
-        #     logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg)
-
-        query_tokens = tokenizer.tokenize(example.question_text)
-
-        if len(query_tokens) > max_query_length:
-            query_tokens = query_tokens[0:max_query_length]
-
-        tok_to_orig_index = []
-        orig_to_tok_index = []
-        all_doc_tokens = []
-        for (i, token) in enumerate(example.doc_tokens):
-            orig_to_tok_index.append(len(all_doc_tokens))
-            sub_tokens = tokenizer.tokenize(token)
-            for sub_token in sub_tokens:
-                tok_to_orig_index.append(i)
-                all_doc_tokens.append(sub_token)
-
-        tok_start_position = None
-        tok_end_position = None
-        if is_training and example.is_impossible:
-            tok_start_position = -1
-            tok_end_position = -1
-        if is_training and not example.is_impossible:
-            tok_start_position = orig_to_tok_index[example.start_position]
-            if example.end_position < len(example.doc_tokens) - 1:
-                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
+
+def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
+    new_scores = {}
+    for qid, s in scores.items():
+        pred_na = na_probs[qid] > na_prob_thresh
+        if pred_na:
+            new_scores[qid] = float(not qid_to_has_ans[qid])
+        else:
+            new_scores[qid] = s
+    return new_scores
+
+
+def make_eval_dict(exact_scores, f1_scores, qid_list=None):
+    if not qid_list:
+        total = len(exact_scores)
+        return collections.OrderedDict([
+            ('exact', 100.0 * sum(exact_scores.values()) / total),
+            ('f1', 100.0 * sum(f1_scores.values()) / total),
+            ('total', total),
+        ])
+    else:
+        total = len(qid_list)
+        return collections.OrderedDict([
+            ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
+            ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
+            ('total', total),
+        ])
+
+
+def merge_eval(main_eval, new_eval, prefix):
+    for k in new_eval:
+        main_eval['%s_%s' % (prefix, k)] = new_eval[k]
+
+
+def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
+    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+    cur_score = num_no_ans
+    best_score = cur_score
+    best_thresh = 0.0
+    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+    for i, qid in enumerate(qid_list):
+        if qid not in scores:
+            continue
+        if qid_to_has_ans[qid]:
+            diff = scores[qid]
+        else:
+            if preds[qid]:
+                diff = -1
            else:
-                tok_end_position = len(all_doc_tokens) - 1
-            (tok_start_position, tok_end_position) = _improve_answer_span(
-                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
-                example.orig_answer_text)
-
-        # The -3 accounts for [CLS], [SEP] and [SEP]
-        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-
-        # We can have documents that are longer than the maximum sequence length.
-        # To deal with this we do a sliding window approach, where we take chunks
-        # of the up to our max length with a stride of `doc_stride`.
-        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
-            "DocSpan", ["start", "length"])
-        doc_spans = []
-        start_offset = 0
-        while start_offset < len(all_doc_tokens):
-            length = len(all_doc_tokens) - start_offset
-            if length > max_tokens_for_doc:
-                length = max_tokens_for_doc
-            doc_spans.append(_DocSpan(start=start_offset, length=length))
-            if start_offset + length == len(all_doc_tokens):
-                break
-            start_offset += min(length, doc_stride)
-
-        for (doc_span_index, doc_span) in enumerate(doc_spans):
-            tokens = []
-            token_to_orig_map = {}
-            token_is_max_context = {}
-            segment_ids = []
-
-            # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
-            # Original TF implem also keep the classification token (set to 0) (not sure why...)
-            p_mask = []
-
-            # CLS token at the beginning
-            if not cls_token_at_end:
-                tokens.append(cls_token)
-                segment_ids.append(cls_token_segment_id)
-                p_mask.append(0)
-                cls_index = 0
-
-            # XLNet: P SEP Q SEP CLS
-            # Others: CLS Q SEP P SEP
-            if not sequence_a_is_doc:
-                # Query
-                tokens += query_tokens
-                segment_ids += [sequence_a_segment_id] * len(query_tokens)
-                p_mask += [1] * len(query_tokens)
-
-                # SEP token
-                tokens.append(sep_token)
-                segment_ids.append(sequence_a_segment_id)
-                p_mask.append(1)
-
-            # Paragraph
-            for i in range(doc_span.length):
-                split_token_index = doc_span.start + i
-                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
-
-                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
-                                                       split_token_index)
-                token_is_max_context[len(tokens)] = is_max_context
-                tokens.append(all_doc_tokens[split_token_index])
-                if not sequence_a_is_doc:
-                    segment_ids.append(sequence_b_segment_id)
-                else:
-                    segment_ids.append(sequence_a_segment_id)
-                p_mask.append(0)
-            paragraph_len = doc_span.length
-
-            if sequence_a_is_doc:
-                # SEP token
-                tokens.append(sep_token)
-                segment_ids.append(sequence_a_segment_id)
-                p_mask.append(1)
-
-                tokens += query_tokens
-                segment_ids += [sequence_b_segment_id] * len(query_tokens)
-                p_mask += [1] * len(query_tokens)
-
-            # SEP token
-            tokens.append(sep_token)
-            segment_ids.append(sequence_b_segment_id)
-            p_mask.append(1)
-
-            # CLS token at the end
-            if cls_token_at_end:
-                tokens.append(cls_token)
-                segment_ids.append(cls_token_segment_id)
-                p_mask.append(0)
-                cls_index = len(tokens) - 1  # Index of classification token
-
-            input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-            # The mask has 1 for real tokens and 0 for padding tokens. Only real
-            # tokens are attended to.
-            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-            # Zero-pad up to the sequence length.
-            while len(input_ids) < max_seq_length:
-                input_ids.append(pad_token)
-                input_mask.append(0 if mask_padding_with_zero else 1)
-                segment_ids.append(pad_token_segment_id)
-                p_mask.append(1)
-
-            assert len(input_ids) == max_seq_length
-            assert len(input_mask) == max_seq_length
-            assert len(segment_ids) == max_seq_length
-
-            span_is_impossible = example.is_impossible
-            start_position = None
-            end_position = None
-            if is_training and not span_is_impossible:
-                # For training, if our document chunk does not contain an annotation
-                # we throw it out, since there is nothing to predict.
-                doc_start = doc_span.start
-                doc_end = doc_span.start + doc_span.length - 1
-                out_of_span = False
-                if not (tok_start_position >= doc_start and
-                        tok_end_position <= doc_end):
-                    out_of_span = True
-                if out_of_span:
-                    start_position = 0
-                    end_position = 0
-                    span_is_impossible = True
-                else:
-                    if sequence_a_is_doc:
-                        doc_offset = 0
-                    else:
-                        doc_offset = len(query_tokens) + 2
-                    start_position = tok_start_position - doc_start + doc_offset
-                    end_position = tok_end_position - doc_start + doc_offset
-
-            if is_training and span_is_impossible:
-                start_position = cls_index
-                end_position = cls_index
-
-            if example_index < 20:
-                logger.info("*** Example ***")
-                logger.info("unique_id: %s" % (unique_id))
-                logger.info("example_index: %s" % (example_index))
-                logger.info("doc_span_index: %s" % (doc_span_index))
-                logger.info("tokens: %s" % " ".join(tokens))
-                logger.info("token_to_orig_map: %s" % " ".join([
-                    "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
-                logger.info("token_is_max_context: %s" % " ".join([
-                    "%d:%s" % (x, y) for (x, y) in token_is_max_context.items()
-                ]))
-                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-                logger.info(
-                    "input_mask: %s" % " ".join([str(x) for x in input_mask]))
-                logger.info(
-                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-                if is_training and span_is_impossible:
-                    logger.info("impossible example")
-                if is_training and not span_is_impossible:
-                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
-                    logger.info("start_position: %d" % (start_position))
-                    logger.info("end_position: %d" % (end_position))
-                    logger.info(
-                        "answer: %s" % (answer_text))
-
-            features.append(
-                InputFeatures(
-                    unique_id=unique_id,
-                    example_index=example_index,
-                    doc_span_index=doc_span_index,
-                    tokens=tokens,
-                    token_to_orig_map=token_to_orig_map,
-                    token_is_max_context=token_is_max_context,
-                    input_ids=input_ids,
-                    input_mask=input_mask,
-                    segment_ids=segment_ids,
-                    cls_index=cls_index,
-                    p_mask=p_mask,
-                    paragraph_len=paragraph_len,
-                    start_position=start_position,
-                    end_position=end_position,
-                    is_impossible=span_is_impossible))
-            unique_id += 1
-
-    return features
-
-
-def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
-                         orig_answer_text):
-    """Returns tokenized answer spans that better match the annotated answer."""
-
-    # The SQuAD annotations are character based. We first project them to
-    # whitespace-tokenized words. But then after WordPiece tokenization, we can
-    # often find a "better match". For example:
-    #
-    #   Question: What year was John Smith born?
-    #   Context: The leader was John Smith (1895-1943).
-    #   Answer: 1895
+                diff = 0
+        cur_score += diff
+        if cur_score > best_score:
+            best_score = cur_score
+            best_thresh = na_probs[qid]
+
+    has_ans_score, has_ans_cnt = 0, 0
+    for qid in qid_list:
+        if not qid_to_has_ans[qid]:
+            continue
+        has_ans_cnt += 1
+
+        if qid not in scores:
+            continue
+        has_ans_score += scores[qid]
+
+    return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt
+
+
+def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+    best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(
+        preds, exact_raw, na_probs, qid_to_has_ans)
+    best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(
+        preds, f1_raw, na_probs, qid_to_has_ans)
+    main_eval['best_exact'] = best_exact
+    main_eval['best_exact_thresh'] = exact_thresh
+    main_eval['best_f1'] = best_f1
+    main_eval['best_f1_thresh'] = f1_thresh
+    main_eval['has_ans_exact'] = has_ans_exact
+    main_eval['has_ans_f1'] = has_ans_f1
+
+
+def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
+    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+    cur_score = num_no_ans
+    best_score = cur_score
+    best_thresh = 0.0
+    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+    for _, qid in enumerate(qid_list):
+        if qid not in scores:
+            continue
+        if qid_to_has_ans[qid]:
+            diff = scores[qid]
+        else:
+            if preds[qid]:
+                diff = -1
+            else:
+                diff = 0
+        cur_score += diff
+        if cur_score > best_score:
+            best_score = cur_score
+            best_thresh = na_probs[qid]
+    return 100.0 * best_score / len(scores), best_thresh
+
+
+def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+    best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
+    best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
+
+    main_eval['best_exact'] = best_exact
+    main_eval['best_exact_thresh'] = exact_thresh
+    main_eval['best_f1'] = best_f1
+    main_eval['best_f1_thresh'] = f1_thresh
+
+
+def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
+    qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples}
+    has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer]
+    no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer]
+
+    if no_answer_probs is None:
+        no_answer_probs = {k: 0.0 for k in preds}
+
+    exact, f1 = get_raw_scores(examples, preds)
+
+    exact_threshold = apply_no_ans_threshold(exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
+    f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
+
+    evaluation = make_eval_dict(exact_threshold, f1_threshold)
+
+    if has_answer_qids:
+        has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids)
+        merge_eval(evaluation, has_ans_eval, 'HasAns')
+
+    if no_answer_qids:
+        no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids)
+        merge_eval(evaluation, no_ans_eval, 'NoAns')
+
+    if no_answer_probs:
+        find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer)
+
+    return evaluation
+
+
+def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
+    """Project the tokenized prediction back to the original text."""
+
+    # When we created the data, we kept track of the alignment between original
+    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
+    # now `orig_text` contains the span of our original text corresponding to the
+    # span that we predicted.
    #
-    # The original whitespace-tokenized answer will be "(1895-1943).". However
-    # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
-    # the exact answer, 1895.
+    # However, `orig_text` may contain extra characters that we don't want in
+    # our prediction.
    #
-    # However, this is not always possible. Consider the following:
+    # For example, let's say:
+    #   pred_text = steve smith
+    #   orig_text = Steve Smith's
    #
-    #   Question: What country is the top exporter of electornics?
-    #   Context: The Japanese electronics industry is the lagest in the world.
-    #   Answer: Japan
+    # We don't want to return `orig_text` because it contains the extra "'s".
    #
-    # In this case, the annotator chose "Japan" as a character sub-span of
-    # the word "Japanese". Since our WordPiece tokenizer does not split
-    # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
-    # in SQuAD, but does happen.
-    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
-
-    for new_start in range(input_start, input_end + 1):
-        for new_end in range(input_end, new_start - 1, -1):
-            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
-            if text_span == tok_answer_text:
-                return (new_start, new_end)
-
-    return (input_start, input_end)
-
-
-def _check_is_max_context(doc_spans, cur_span_index, position):
-    """Check if this is the 'max context' doc span for the token."""
-
-    # Because of the sliding window approach taken to scoring documents, a single
-    # token can appear in multiple documents. E.g.
-    #  Doc: the man went to the store and bought a gallon of milk
-    #  Span A: the man went to the
-    #  Span B: to the store and bought
-    #  Span C: and bought a gallon of
-    #  ...
+    # We don't want to return `pred_text` because it's already been normalized
+    # (the SQuAD eval script also does punctuation stripping/lower casing but
+    # our tokenizer does additional normalization like stripping accent
+    # characters).
    #
-    # Now the word 'bought' will have two scores from spans B and C. We only
-    # want to consider the score with "maximum context", which we define as
-    # the *minimum* of its left and right context (the *sum* of left and
-    # right context will always be the same, of course).
+    # What we really want to return is "Steve Smith".
    #
-    # In the example the maximum context for 'bought' would be span C since
-    # it has 1 left context and 3 right context, while span B has 4 left context
-    # and 0 right context.
-    best_score = None
-    best_span_index = None
-    for (span_index, doc_span) in enumerate(doc_spans):
-        end = doc_span.start + doc_span.length - 1
-        if position < doc_span.start:
-            continue
-        if position > end:
-            continue
-        num_left_context = position - doc_span.start
-        num_right_context = end - position
-        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
-        if best_score is None or score > best_score:
-            best_score = score
-            best_span_index = span_index
+    # Therefore, we have to apply a semi-complicated alignment heuristic between
+    # `pred_text` and `orig_text` to get a character-to-character alignment. This
+    # can fail in certain cases in which case we just return `orig_text`.
+
+    def _strip_spaces(text):
+        ns_chars = []
+        ns_to_s_map = collections.OrderedDict()
+        for (i, c) in enumerate(text):
+            if c == " ":
+                continue
+            ns_to_s_map[len(ns_chars)] = i
+            ns_chars.append(c)
+        ns_text = "".join(ns_chars)
+        return (ns_text, ns_to_s_map)
+
+    # We first tokenize `orig_text`, strip whitespace from the result
+    # and `pred_text`, and check if they are the same length. If they are
+    # NOT the same length, the heuristic has failed. If they are the same
+    # length, we assume the characters are one-to-one aligned.
+    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+
+    tok_text = " ".join(tokenizer.tokenize(orig_text))
+
+    start_position = tok_text.find(pred_text)
+    if start_position == -1:
+        if verbose_logging:
+            logger.info(
+                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
+        return orig_text
+    end_position = start_position + len(pred_text) - 1
+
+    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
+    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
+
+    if len(orig_ns_text) != len(tok_ns_text):
+        if verbose_logging:
+            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
+                        orig_ns_text, tok_ns_text)
+        return orig_text
+
+    # We then project the characters in `pred_text` back to `orig_text` using
+    # the character-to-character alignment.
+    tok_s_to_ns_map = {}
+    for (i, tok_index) in tok_ns_to_s_map.items():
+        tok_s_to_ns_map[tok_index] = i
+
+    orig_start_position = None
+    if start_position in tok_s_to_ns_map:
+        ns_start_position = tok_s_to_ns_map[start_position]
+        if ns_start_position in orig_ns_to_s_map:
+            orig_start_position = orig_ns_to_s_map[ns_start_position]
+
+    if orig_start_position is None:
+        if verbose_logging:
+            logger.info("Couldn't map start position")
+        return orig_text
+
+    orig_end_position = None
+    if end_position in tok_s_to_ns_map:
+        ns_end_position = tok_s_to_ns_map[end_position]
+        if ns_end_position in orig_ns_to_s_map:
+            orig_end_position = orig_ns_to_s_map[ns_end_position]
+
+    if orig_end_position is None:
+        if verbose_logging:
+            logger.info("Couldn't map end position")
+        return orig_text
+
+    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
+    return output_text
+
+
+def _get_best_indexes(logits, n_best_size):
+    """Get the n-best logits from a list."""
+    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
+
+    best_indexes = []
+    for i in range(len(index_and_score)):
+        if i >= n_best_size:
+            break
+        best_indexes.append(index_and_score[i][0])
+    return best_indexes
+
+
+def _compute_softmax(scores):
+    """Compute softmax probability over raw logits."""
+    if not scores:
+        return []

-    return cur_span_index == best_span_index
+    max_score = None
+    for score in scores:
+        if max_score is None or score > max_score:
+            max_score = score
+
+    exp_scores = []
+    total_sum = 0.0
+    for score in scores:
+        x = math.exp(score - max_score)
+        exp_scores.append(x)
+        total_sum += x

+    probs = []
+    for score in exp_scores:
+        probs.append(score / total_sum)
+    return probs

-RawResult = collections.namedtuple("RawResult",
-                                   ["unique_id", "start_logits", "end_logits"])

-def write_predictions(all_examples, all_features, all_results, n_best_size,
-                      max_answer_length, do_lower_case, output_prediction_file,
-                      output_nbest_file, output_null_log_odds_file, verbose_logging,
-                      version_2_with_negative, null_score_diff_threshold):
+def compute_predictions_logits(
+    all_examples,
+    all_features,
+    all_results,
+    n_best_size,
+    max_answer_length,
+    do_lower_case,
+    output_prediction_file,
+    output_nbest_file,
+    output_null_log_odds_file,
+    verbose_logging,
+    version_2_with_negative,
+    null_score_diff_threshold
+):
    """Write final predictions to the json file and log-odds of null if needed."""
    logger.info("Writing predictions to: %s" % (output_prediction_file))
    logger.info("Writing nbest to: %s" % (output_nbest_file))
@@ -626,12 +507,12 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                        text="",
                        start_logit=null_start_logit,
                        end_logit=null_end_logit))
-                
+
            # In very rare edge cases we could only have single null prediction.
            # So we just create a nonce prediction in this case to avoid failure.
-            if len(nbest)==1:
+            if len(nbest) == 1:
                nbest.insert(0,
-                    _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+                             _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
@@ -688,18 +569,21 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
    return all_predictions


-# For XLNet (and XLM which uses the same head)
-RawResultExtended = collections.namedtuple("RawResultExtended",
-    ["unique_id", "start_top_log_probs", "start_top_index",
-     "end_top_log_probs", "end_top_index", "cls_logits"])
-
-
-def write_predictions_extended(all_examples, all_features, all_results, n_best_size,
-                                max_answer_length, output_prediction_file,
-                                output_nbest_file,
-                                output_null_log_odds_file, orig_data_file,
-                                start_n_top, end_n_top, version_2_with_negative,
-                                tokenizer, verbose_logging):
+def compute_predictions_log_probs(
+    all_examples,
+    all_features,
+    all_results,
+    n_best_size,
+    max_answer_length,
+    output_prediction_file,
+    output_nbest_file,
+    output_null_log_odds_file,
+    start_n_top,
+    end_n_top,
+    version_2_with_negative,
+    tokenizer,
+    verbose_logging
+):
    """ XLNet write prediction logic (more complex than Bert's).
        Write final predictions to the json file and log-odds of null if needed.

@@ -708,7 +592,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["feature_index", "start_index", "end_index",
-        "start_log_prob", "end_log_prob"])
+         "start_log_prob", "end_log_prob"])

    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
@@ -745,12 +629,12 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s

            for i in range(start_n_top):
                for j in range(end_n_top):
-                    start_log_prob = result.start_top_log_probs[i]
+                    start_log_prob = result.start_logits[i]
                    start_index = result.start_top_index[i]

                    j_index = i * end_n_top + j

-                    end_log_prob = result.end_top_log_probs[j_index]
+                    end_log_prob = result.end_logits[j_index]
                    end_index = result.end_top_index[j_index]

                    # We could hypothetically create invalid predictions, e.g., predict
@@ -791,7 +675,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s

            # XLNet un-tokenizer
            # Let's keep it simple for now and see if we need all this later.
-            # 
+            #
            # tok_start_to_orig_index = feature.tok_start_to_orig_index
            # tok_end_to_orig_index = feature.tok_end_to_orig_index
            # start_orig_pos = tok_start_to_orig_index[pred.start_index]
@@ -811,7 +695,12 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
            tok_text = " ".join(tok_text.split())
            orig_text = " ".join(orig_tokens)

-            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
+            if hasattr(tokenizer, "do_lower_case"):
+                do_lower_case = tokenizer.do_lower_case
+            else:
+                do_lower_case = tokenizer.do_lowercase_and_remove_accent
+
+            final_text = get_final_text(tok_text, orig_text, do_lower_case,
                                        verbose_logging)

            if final_text in seen_predictions:
@@ -871,146 +760,4 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

-    with open(orig_data_file, "r", encoding='utf-8') as reader:
-        orig_data = json.load(reader)["data"]
-
-    qid_to_has_ans = make_qid_to_has_ans(orig_data)
-    has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
-    no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
-    exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions)
-    out_eval = {}
-
-    find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans)
-
-    return out_eval
-
-
-def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
-    """Project the tokenized prediction back to the original text."""
-
-    # When we created the data, we kept track of the alignment between original
-    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
-    # now `orig_text` contains the span of our original text corresponding to the
-    # span that we predicted.
-    #
-    # However, `orig_text` may contain extra characters that we don't want in
-    # our prediction.
-    #
-    # For example, let's say:
-    #   pred_text = steve smith
-    #   orig_text = Steve Smith's
-    #
-    # We don't want to return `orig_text` because it contains the extra "'s".
-    #
-    # We don't want to return `pred_text` because it's already been normalized
-    # (the SQuAD eval script also does punctuation stripping/lower casing but
-    # our tokenizer does additional normalization like stripping accent
-    # characters).
-    #
-    # What we really want to return is "Steve Smith".
-    #
-    # Therefore, we have to apply a semi-complicated alignment heuristic between
-    # `pred_text` and `orig_text` to get a character-to-character alignment. This
-    # can fail in certain cases in which case we just return `orig_text`.
-
-    def _strip_spaces(text):
-        ns_chars = []
-        ns_to_s_map = collections.OrderedDict()
-        for (i, c) in enumerate(text):
-            if c == " ":
-                continue
-            ns_to_s_map[len(ns_chars)] = i
-            ns_chars.append(c)
-        ns_text = "".join(ns_chars)
-        return (ns_text, ns_to_s_map)
-
-    # We first tokenize `orig_text`, strip whitespace from the result
-    # and `pred_text`, and check if they are the same length. If they are
-    # NOT the same length, the heuristic has failed. If they are the same
-    # length, we assume the characters are one-to-one aligned.
-    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
-
-    tok_text = " ".join(tokenizer.tokenize(orig_text))
-
-    start_position = tok_text.find(pred_text)
-    if start_position == -1:
-        if verbose_logging:
-            logger.info(
-                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
-        return orig_text
-    end_position = start_position + len(pred_text) - 1
-
-    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
-    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
-
-    if len(orig_ns_text) != len(tok_ns_text):
-        if verbose_logging:
-            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
-                        orig_ns_text, tok_ns_text)
-        return orig_text
-
-    # We then project the characters in `pred_text` back to `orig_text` using
-    # the character-to-character alignment.
-    tok_s_to_ns_map = {}
-    for (i, tok_index) in tok_ns_to_s_map.items():
-        tok_s_to_ns_map[tok_index] = i
-
-    orig_start_position = None
-    if start_position in tok_s_to_ns_map:
-        ns_start_position = tok_s_to_ns_map[start_position]
-        if ns_start_position in orig_ns_to_s_map:
-            orig_start_position = orig_ns_to_s_map[ns_start_position]
-
-    if orig_start_position is None:
-        if verbose_logging:
-            logger.info("Couldn't map start position")
-        return orig_text
-
-    orig_end_position = None
-    if end_position in tok_s_to_ns_map:
-        ns_end_position = tok_s_to_ns_map[end_position]
-        if ns_end_position in orig_ns_to_s_map:
-            orig_end_position = orig_ns_to_s_map[ns_end_position]
-
-    if orig_end_position is None:
-        if verbose_logging:
-            logger.info("Couldn't map end position")
-        return orig_text
-
-    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
-    return output_text
-
-
-def _get_best_indexes(logits, n_best_size):
-    """Get the n-best logits from a list."""
-    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
-
-    best_indexes = []
-    for i in range(len(index_and_score)):
-        if i >= n_best_size:
-            break
-        best_indexes.append(index_and_score[i][0])
-    return best_indexes
-
-
-def _compute_softmax(scores):
-    """Compute softmax probability over raw logits."""
-    if not scores:
-        return []
-
-    max_score = None
-    for score in scores:
-        if max_score is None or score > max_score:
-            max_score = score
-
-    exp_scores = []
-    total_sum = 0.0
-    for score in scores:
-        x = math.exp(score - max_score)
-        exp_scores.append(x)
-        total_sum += x
-
-    probs = []
-    for score in exp_scores:
-        probs.append(score / total_sum)
-    return probs
+    return all_predictions
--- a/transformers/data/processors/__init__.py
+++ b/transformers/data/processors/__init__.py
-from .utils import InputExample, InputFeatures, DataProcessor
+from .utils import InputExample, InputFeatures, DataProcessor, SingleSentenceClassificationProcessor
 from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
-
+from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
+from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
\ No newline at end of file
--- a/transformers/data/processors/glue.py
+++ b/transformers/data/processors/glue.py
@@ -133,7 +133,7 @@ def glue_convert_examples_to_features(examples, tokenizer,
    if is_tf_available() and is_tf_dataset:
        def gen():
            for ex in features:
-                yield  ({'input_ids': ex.input_ids,
+                yield ({'input_ids': ex.input_ids,
                         'attention_mask': ex.attention_mask,
                         'token_type_ids': ex.token_type_ids},
                        ex.label)

--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
+from tqdm import tqdm
+import collections
+import logging
+import os
+import json
+import numpy as np
+
+from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
+from .utils import DataProcessor, InputExample, InputFeatures
+from ...file_utils import is_tf_available, is_torch_available
+
+if is_torch_available():
+    import torch
+    from torch.utils.data import TensorDataset
+
+if is_tf_available():
+    import tensorflow as tf
+
+logger = logging.getLogger(__name__)
+
+
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
+    """Returns tokenized answer spans that better match the annotated answer."""
+    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
+
+    for new_start in range(input_start, input_end + 1):
+        for new_end in range(input_end, new_start - 1, -1):
+            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
+            if text_span == tok_answer_text:
+                return (new_start, new_end)
+
+    return (input_start, input_end)
+
+
+def _check_is_max_context(doc_spans, cur_span_index, position):
+    """Check if this is the 'max context' doc span for the token."""
+    best_score = None
+    best_span_index = None
+    for (span_index, doc_span) in enumerate(doc_spans):
+        end = doc_span.start + doc_span.length - 1
+        if position < doc_span.start:
+            continue
+        if position > end:
+            continue
+        num_left_context = position - doc_span.start
+        num_right_context = end - position
+        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
+        if best_score is None or score > best_score:
+            best_score = score
+            best_span_index = span_index
+
+    return cur_span_index == best_span_index
+
+
+def _new_check_is_max_context(doc_spans, cur_span_index, position):
+    """Check if this is the 'max context' doc span for the token."""
+    # if len(doc_spans) == 1:
+    # return True
+    best_score = None
+    best_span_index = None
+    for (span_index, doc_span) in enumerate(doc_spans):
+        end = doc_span["start"] + doc_span["length"] - 1
+        if position < doc_span["start"]:
+            continue
+        if position > end:
+            continue
+        num_left_context = position - doc_span["start"]
+        num_right_context = end - position
+        score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
+        if best_score is None or score > best_score:
+            best_score = score
+            best_span_index = span_index
+
+    return cur_span_index == best_span_index
+
+
+def _is_whitespace(c):
+    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
+        return True
+    return False
+
+
+def squad_convert_examples_to_features(
+    examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False
+):
+    """
+    Converts a list of examples into a list of features that can be directly given as input to a model.
+    It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
+
+    Args:
+        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
+        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
+        max_seq_length: The maximum sequence length of the inputs.
+        doc_stride: The stride used when the context is too large and is split across several features.
+        max_query_length: The maximum length of the query.
+        is_training: whether to create features for model evaluation or model training.
+        return_dataset: Default False. Either 'pt' or 'tf'.
+            if 'pt': returns a torch.data.TensorDataset,
+            if 'tf': returns a tf.data.Dataset
+
+    Returns:
+        list of :class:`~transformers.data.processors.squad.SquadFeatures`
+
+    Example::
+
+        processor = SquadV2Processor()
+        examples = processor.get_dev_examples(data_dir)
+
+        features = squad_convert_examples_to_features( 
+            examples=examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=not evaluate,
+        )
+    """
+
+    # Defining helper methods
+    unique_id = 1000000000
+
+    features = []
+    for (example_index, example) in enumerate(tqdm(examples, desc="Converting examples to features")):
+        if is_training and not example.is_impossible:
+            # Get start and end position
+            start_position = example.start_position
+            end_position = example.end_position
+
+            # If the answer cannot be found in the text, then skip this example.
+            actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
+            cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
+            if actual_text.find(cleaned_answer_text) == -1:
+                logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
+                continue
+
+        tok_to_orig_index = []
+        orig_to_tok_index = []
+        all_doc_tokens = []
+        for (i, token) in enumerate(example.doc_tokens):
+            orig_to_tok_index.append(len(all_doc_tokens))
+            sub_tokens = tokenizer.tokenize(token)
+            for sub_token in sub_tokens:
+                tok_to_orig_index.append(i)
+                all_doc_tokens.append(sub_token)
+
+        if is_training and not example.is_impossible:
+            tok_start_position = orig_to_tok_index[example.start_position]
+            if example.end_position < len(example.doc_tokens) - 1:
+                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
+            else:
+                tok_end_position = len(all_doc_tokens) - 1
+
+            (tok_start_position, tok_end_position) = _improve_answer_span(
+                all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
+            )
+
+        spans = []
+
+        truncated_query = tokenizer.encode(
+            example.question_text, add_special_tokens=False, max_length=max_query_length
+        )
+        sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence
+        sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair
+
+        span_doc_tokens = all_doc_tokens
+        while len(spans) * doc_stride < len(all_doc_tokens):
+
+            encoded_dict = tokenizer.encode_plus(
+                truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
+                span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
+                max_length=max_seq_length,
+                return_overflowing_tokens=True,
+                pad_to_max_length=True,
+                stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
+                truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
+            )
+
+            paragraph_len = min(
+                len(all_doc_tokens) - len(spans) * doc_stride,
+                max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
+            )
+
+            if tokenizer.pad_token_id in encoded_dict["input_ids"]:
+                non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
+            else:
+                non_padded_ids = encoded_dict["input_ids"]
+
+            tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
+
+            token_to_orig_map = {}
+            for i in range(paragraph_len):
+                index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
+                token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
+
+            encoded_dict["paragraph_len"] = paragraph_len
+            encoded_dict["tokens"] = tokens
+            encoded_dict["token_to_orig_map"] = token_to_orig_map
+            encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
+            encoded_dict["token_is_max_context"] = {}
+            encoded_dict["start"] = len(spans) * doc_stride
+            encoded_dict["length"] = paragraph_len
+
+            spans.append(encoded_dict)
+
+            if "overflowing_tokens" not in encoded_dict:
+                break
+            span_doc_tokens = encoded_dict["overflowing_tokens"]
+
+        for doc_span_index in range(len(spans)):
+            for j in range(spans[doc_span_index]["paragraph_len"]):
+                is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
+                index = (
+                    j
+                    if tokenizer.padding_side == "left"
+                    else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+                )
+                spans[doc_span_index]["token_is_max_context"][index] = is_max_context
+
+        for span in spans:
+            # Identify the position of the CLS token
+            cls_index = span["input_ids"].index(tokenizer.cls_token_id)
+
+            # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
+            # Original TF implem also keep the classification token (set to 0) (not sure why...)
+            p_mask = np.array(span["token_type_ids"])
+
+            p_mask = np.minimum(p_mask, 1)
+
+            if tokenizer.padding_side == "right":
+                # Limit positive values to one
+                p_mask = 1 - p_mask
+
+            p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1
+
+            # Set the CLS index to '0'
+            p_mask[cls_index] = 0
+
+            span_is_impossible = example.is_impossible
+            start_position = 0
+            end_position = 0
+            if is_training and not span_is_impossible:
+                # For training, if our document chunk does not contain an annotation
+                # we throw it out, since there is nothing to predict.
+                doc_start = span["start"]
+                doc_end = span["start"] + span["length"] - 1
+                out_of_span = False
+
+                if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
+                    out_of_span = True
+
+                if out_of_span:
+                    start_position = cls_index
+                    end_position = cls_index
+                    span_is_impossible = True
+                else:
+                    if tokenizer.padding_side == "left":
+                        doc_offset = 0
+                    else:
+                        doc_offset = len(truncated_query) + sequence_added_tokens
+
+                    start_position = tok_start_position - doc_start + doc_offset
+                    end_position = tok_end_position - doc_start + doc_offset
+
+            features.append(
+                SquadFeatures(
+                    span["input_ids"],
+                    span["attention_mask"],
+                    span["token_type_ids"],
+                    cls_index,
+                    p_mask.tolist(),
+                    example_index=example_index,
+                    unique_id=unique_id,
+                    paragraph_len=span["paragraph_len"],
+                    token_is_max_context=span["token_is_max_context"],
+                    tokens=span["tokens"],
+                    token_to_orig_map=span["token_to_orig_map"],
+                    start_position=start_position,
+                    end_position=end_position,
+                )
+            )
+
+            unique_id += 1
+
+    if return_dataset == "pt":
+        if not is_torch_available():
+            raise ImportError("Pytorch must be installed to return a pytorch dataset.")
+
+        # Convert to Tensors and build dataset
+        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+        all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
+        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
+
+        if not is_training:
+            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+            dataset = TensorDataset(
+                all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index, all_p_mask
+            )
+        else:
+            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
+            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
+            dataset = TensorDataset(
+                all_input_ids,
+                all_attention_masks,
+                all_token_type_ids,
+                all_start_positions,
+                all_end_positions,
+                all_cls_index,
+                all_p_mask,
+            )
+
+        return features, dataset
+    elif return_dataset == "tf":
+        if not is_tf_available():
+            raise ImportError("TensorFlow must be installed to return a TensorFlow dataset.")
+
+        def gen():
+            for ex in features:
+                yield (
+                    {
+                        "input_ids": ex.input_ids,
+                        "attention_mask": ex.attention_mask,
+                        "token_type_ids": ex.token_type_ids,
+                    }, {
+                        "start_position": ex.start_position,
+                        "end_position": ex.end_position,
+                        "cls_index": ex.cls_index,
+                        "p_mask": ex.p_mask,
+                    }
+                )
+
+        return tf.data.Dataset.from_generator(
+            gen,
+            (
+                {"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32},
+                {"start_position": tf.int64, "end_position": tf.int64, "cls_index": tf.int64, "p_mask": tf.int32},
+            ),
+            (
+                {
+                    "input_ids": tf.TensorShape([None]),
+                    "attention_mask": tf.TensorShape([None]),
+                    "token_type_ids": tf.TensorShape([None]),
+                },
+                {
+                    "start_position": tf.TensorShape([]),
+                    "end_position": tf.TensorShape([]),
+                    "cls_index": tf.TensorShape([]),
+                    "p_mask": tf.TensorShape([None]),
+                },
+            ),
+        )
+
+    return features
+
+
+class SquadProcessor(DataProcessor):
+    """
+    Processor for the SQuAD data set.
+    Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively.
+    """
+
+    train_file = None
+    dev_file = None
+
+    def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
+        if not evaluate:
+            answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8")
+            answer_start = tensor_dict["answers"]["answer_start"][0].numpy()
+            answers = []
+        else:
+            answers = [
+                {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")}
+                for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"])
+            ]
+
+            answer = None
+            answer_start = None
+
+        return SquadExample(
+            qas_id=tensor_dict["id"].numpy().decode("utf-8"),
+            question_text=tensor_dict["question"].numpy().decode("utf-8"),
+            context_text=tensor_dict["context"].numpy().decode("utf-8"),
+            answer_text=answer,
+            start_position_character=answer_start,
+            title=tensor_dict["title"].numpy().decode("utf-8"),
+            answers=answers,
+        )
+
+    def get_examples_from_dataset(self, dataset, evaluate=False):
+        """
+        Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset.
+
+        Args:
+            dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
+            evaluate: boolean specifying if in evaluation mode or in training mode
+
+        Returns:
+            List of SquadExample
+
+        Examples::
+
+            import tensorflow_datasets as tfds
+            dataset = tfds.load("squad")
+
+            training_examples = get_examples_from_dataset(dataset, evaluate=False)
+            evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
+        """
+
+        if evaluate:
+            dataset = dataset["validation"]
+        else:
+            dataset = dataset["train"]
+
+        examples = []
+        for tensor_dict in tqdm(dataset):
+            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))
+
+        return examples
+
+    def get_train_examples(self, data_dir, filename=None):
+        """
+        Returns the training examples from the data directory.
+
+        Args:
+            data_dir: Directory containing the data files used for training and evaluating.
+            filename: None by default, specify this if the training file has a different name than the original one
+                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
+
+        """
+        if data_dir is None:
+            data_dir = ""
+
+        if self.train_file is None:
+            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
+
+        with open(
+            os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8"
+        ) as reader:
+            input_data = json.load(reader)["data"]
+        return self._create_examples(input_data, "train")
+
+    def get_dev_examples(self, data_dir, filename=None):
+        """
+        Returns the evaluation example from the data directory.
+
+        Args:
+            data_dir: Directory containing the data files used for training and evaluating.
+            filename: None by default, specify this if the evaluation file has a different name than the original one
+                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
+        """
+        if data_dir is None:
+            data_dir = ""
+
+        if self.dev_file is None:
+            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
+
+        with open(
+            os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8"
+        ) as reader:
+            input_data = json.load(reader)["data"]
+        return self._create_examples(input_data, "dev")
+
+    def _create_examples(self, input_data, set_type):
+        is_training = set_type == "train"
+        examples = []
+        for entry in tqdm(input_data):
+            title = entry["title"]
+            for paragraph in entry["paragraphs"]:
+                context_text = paragraph["context"]
+                for qa in paragraph["qas"]:
+                    qas_id = qa["id"]
+                    question_text = qa["question"]
+                    start_position_character = None
+                    answer_text = None
+                    answers = []
+
+                    if "is_impossible" in qa:
+                        is_impossible = qa["is_impossible"]
+                    else:
+                        is_impossible = False
+
+                    if not is_impossible:
+                        if is_training:
+                            answer = qa["answers"][0]
+                            answer_text = answer["text"]
+                            start_position_character = answer["answer_start"]
+                        else:
+                            answers = qa["answers"]
+
+                    example = SquadExample(
+                        qas_id=qas_id,
+                        question_text=question_text,
+                        context_text=context_text,
+                        answer_text=answer_text,
+                        start_position_character=start_position_character,
+                        title=title,
+                        is_impossible=is_impossible,
+                        answers=answers,
+                    )
+
+                    examples.append(example)
+        return examples
+
+
+class SquadV1Processor(SquadProcessor):
+    train_file = "train-v1.1.json"
+    dev_file = "dev-v1.1.json"
+
+
+class SquadV2Processor(SquadProcessor):
+    train_file = "train-v2.0.json"
+    dev_file = "dev-v2.0.json"
+
+
+class SquadExample(object):
+    """
+    A single training/test example for the Squad dataset, as loaded from disk.
+
+    Args:
+        qas_id: The example's unique identifier
+        question_text: The question string
+        context_text: The context string
+        answer_text: The answer string
+        start_position_character: The character position of the start of the answer
+        title: The title of the example
+        answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
+        is_impossible: False by default, set to True if the example has no possible answer.
+    """
+
+    def __init__(
+        self,
+        qas_id,
+        question_text,
+        context_text,
+        answer_text,
+        start_position_character,
+        title,
+        answers=[],
+        is_impossible=False,
+    ):
+        self.qas_id = qas_id
+        self.question_text = question_text
+        self.context_text = context_text
+        self.answer_text = answer_text
+        self.title = title
+        self.is_impossible = is_impossible
+        self.answers = answers
+
+        self.start_position, self.end_position = 0, 0
+
+        doc_tokens = []
+        char_to_word_offset = []
+        prev_is_whitespace = True
+
+        # Split on whitespace so that different tokens may be attributed to their original position.
+        for c in self.context_text:
+            if _is_whitespace(c):
+                prev_is_whitespace = True
+            else:
+                if prev_is_whitespace:
+                    doc_tokens.append(c)
+                else:
+                    doc_tokens[-1] += c
+                prev_is_whitespace = False
+            char_to_word_offset.append(len(doc_tokens) - 1)
+
+        self.doc_tokens = doc_tokens
+        self.char_to_word_offset = char_to_word_offset
+
+        # Start end end positions only has a value during evaluation.
+        if start_position_character is not None and not is_impossible:
+            self.start_position = char_to_word_offset[start_position_character]
+            self.end_position = char_to_word_offset[
+                min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
+            ]
+
+
+class SquadFeatures(object):
+    """
+    Single squad example features to be fed to a model.
+    Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample`
+    using the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
+
+    Args:
+        input_ids: Indices of input sequence tokens in the vocabulary.
+        attention_mask: Mask to avoid performing attention on padding token indices.
+        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
+        cls_index: the index of the CLS token.
+        p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
+            Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer
+        example_index: the index of the example
+        unique_id: The unique Feature identifier
+        paragraph_len: The length of the context
+        token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object.
+            If a token does not have their maximum context in this feature object, it means that another feature object
+            has more information related to that token and should be prioritized over this feature for that token.
+        tokens: list of tokens corresponding to the input ids
+        token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
+        start_position: start of the answer token index 
+        end_position: end of the answer token index 
+    """
+
+    def __init__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        cls_index,
+        p_mask,
+        example_index,
+        unique_id,
+        paragraph_len,
+        token_is_max_context,
+        tokens,
+        token_to_orig_map,
+        start_position,
+        end_position,
+    ):
+        self.input_ids = input_ids
+        self.attention_mask = attention_mask
+        self.token_type_ids = token_type_ids
+        self.cls_index = cls_index
+        self.p_mask = p_mask
+
+        self.example_index = example_index
+        self.unique_id = unique_id
+        self.paragraph_len = paragraph_len
+        self.token_is_max_context = token_is_max_context
+        self.tokens = tokens
+        self.token_to_orig_map = token_to_orig_map
+
+        self.start_position = start_position
+        self.end_position = end_position
+
+
+class SquadResult(object):
+    """
+    Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.
+
+    Args:
+        unique_id: The unique identifier corresponding to that example.
+        start_logits: The logits corresponding to the start of the answer
+        end_logits: The logits corresponding to the end of the answer
+    """
+
+    def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
+        self.start_logits = start_logits
+        self.end_logits = end_logits
+        self.unique_id = unique_id
+
+        if start_top_index:
+            self.start_top_index = start_top_index
+            self.end_top_index = end_top_index
+            self.cls_logits = cls_logits
--- a/transformers/data/processors/utils.py
+++ b/transformers/data/processors/utils.py
@@ -18,6 +18,11 @@ import csv
 import sys
 import copy
 import json
+import logging
+
+from ...file_utils import is_tf_available, is_torch_available
+
+logger = logging.getLogger(__name__)

 class InputExample(object):
    """
@@ -64,7 +69,7 @@ class InputFeatures(object):
        label: Label corresponding to the input
    """

-    def __init__(self, input_ids, attention_mask, token_type_ids, label):
+    def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
@@ -86,34 +91,6 @@ class InputFeatures(object):
 class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

-    def get_example_from_tensor_dict(self, tensor_dict):
-        """Gets an example from a dict with tensorflow tensors
-
-        Args:
-            tensor_dict: Keys and values should match the corresponding Glue
-                tensorflow_dataset examples.
-        """
-        raise NotImplementedError()
-
-    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-
-    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
-
-    def tfds_map(self, example):
-        """Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. 
-        This method converts examples to the correct format."""
-        if len(self.get_labels()) > 1:
-            example.label = self.get_labels()[int(example.label)]
-        return example
-
    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
@@ -125,3 +102,215 @@ class DataProcessor(object):
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines
+
+
+class SingleSentenceClassificationProcessor(DataProcessor):
+    """ Generic processor for a single sentence classification data set."""
+    def __init__(self, labels=None, examples=None, mode='classification', verbose=False):
+        self.labels = [] if labels is None else labels
+        self.examples = [] if examples is None else examples
+        self.mode = mode
+        self.verbose = verbose
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, idx):
+        if isinstance(idx, slice):
+            return SingleSentenceClassificationProcessor(labels=self.labels,
+                                                         examples=self.examples[idx])
+        return self.examples[idx]
+
+    @classmethod
+    def create_from_csv(cls, file_name, split_name='', column_label=0, column_text=1,
+                        column_id=None, skip_first_row=False, **kwargs):
+        processor = cls(**kwargs)
+        processor.add_examples_from_csv(file_name,
+                                        split_name=split_name,
+                                        column_label=column_label,
+                                        column_text=column_text,
+                                        column_id=column_id,
+                                        skip_first_row=skip_first_row,
+                                        overwrite_labels=True,
+                                        overwrite_examples=True)
+        return processor
+
+    @classmethod
+    def create_from_examples(cls, texts_or_text_and_labels, labels=None, **kwargs):
+        processor = cls(**kwargs)
+        processor.add_examples(texts_or_text_and_labels, labels=labels)
+        return processor
+
+    def add_examples_from_csv(self, file_name, split_name='', column_label=0, column_text=1, column_id=None,
+                              skip_first_row=False, overwrite_labels=False, overwrite_examples=False):
+        lines = self._read_tsv(file_name)
+        if skip_first_row:
+            lines = lines[1:]
+        texts = []
+        labels = []
+        ids = []
+        for (i, line) in enumerate(lines):
+            texts.append(line[column_text])
+            labels.append(line[column_label])
+            if column_id is not None:
+                ids.append(line[column_id])
+            else:
+                guid = "%s-%s" % (split_name, i) if split_name else "%s" % i
+                ids.append(guid)
+
+        return self.add_examples(texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples)
+
+    def add_examples(self, texts_or_text_and_labels, labels=None, ids=None,
+                     overwrite_labels=False, overwrite_examples=False):
+        assert labels is None or len(texts_or_text_and_labels) == len(labels)
+        assert ids is None or len(texts_or_text_and_labels) == len(ids)
+        if ids is None:
+            ids = [None] * len(texts_or_text_and_labels)
+        if labels is None:
+            labels = [None] * len(texts_or_text_and_labels)
+        examples = []
+        added_labels = set()
+        for (text_or_text_and_label, label, guid) in zip(texts_or_text_and_labels, labels, ids):
+            if isinstance(text_or_text_and_label, (tuple, list)) and label is None:
+                text, label = text_or_text_and_label
+            else:
+                text = text_or_text_and_label
+            added_labels.add(label)
+            examples.append(InputExample(guid=guid, text_a=text, text_b=None, label=label))
+
+        # Update examples
+        if overwrite_examples:
+            self.examples = examples
+        else:
+            self.examples.extend(examples)
+
+        # Update labels
+        if overwrite_labels:
+            self.labels = list(added_labels)
+        else:
+            self.labels = list(set(self.labels).union(added_labels))
+
+        return self.examples
+
+    def get_features(self,
+                     tokenizer,
+                     max_length=None,
+                     pad_on_left=False,
+                     pad_token=0,
+                     mask_padding_with_zero=True,
+                     return_tensors=None):
+        """
+        Convert examples in a list of ``InputFeatures``
+
+        Args:
+            tokenizer: Instance of a tokenizer that will tokenize the examples
+            max_length: Maximum example length
+            task: GLUE task
+            label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
+            output_mode: String indicating the output mode. Either ``regression`` or ``classification``
+            pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
+            pad_token: Padding token
+            mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
+                and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
+                actual values)
+
+        Returns:
+            If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
+            containing the task-specific features. If the input is a list of ``InputExamples``, will return
+            a list of task-specific ``InputFeatures`` which can be fed to the model.
+
+        """
+        if max_length is None:
+            max_length = tokenizer.max_len
+
+        label_map = {label: i for i, label in enumerate(self.labels)}
+
+        all_input_ids = []
+        for (ex_index, example) in enumerate(self.examples):
+            if ex_index % 10000 == 0:
+                logger.info("Tokenizing example %d", ex_index)
+
+            input_ids = tokenizer.encode(
+                example.text_a,
+                add_special_tokens=True,
+                max_length=min(max_length, tokenizer.max_len),
+            )
+            all_input_ids.append(input_ids)
+
+        batch_length = max(len(input_ids) for input_ids in all_input_ids)
+
+        features = []
+        for (ex_index, (input_ids, example)) in enumerate(zip(all_input_ids, self.examples)):
+            if ex_index % 10000 == 0:
+                logger.info("Writing example %d", ex_index)
+            # The mask has 1 for real tokens and 0 for padding tokens. Only real
+            # tokens are attended to.
+            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+            # Zero-pad up to the sequence length.
+            padding_length = batch_length - len(input_ids)
+            if pad_on_left:
+                input_ids = ([pad_token] * padding_length) + input_ids
+                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
+            else:
+                input_ids = input_ids + ([pad_token] * padding_length)
+                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+
+            assert len(input_ids) == batch_length, "Error with input length {} vs {}".format(len(input_ids), batch_length)
+            assert len(attention_mask) == batch_length, "Error with input length {} vs {}".format(len(attention_mask), batch_length)
+
+            if self.mode == "classification":
+                label = label_map[example.label]
+            elif self.mode == "regression":
+                label = float(example.label)
+            else:
+                raise ValueError(self.mode)
+
+            if ex_index < 5 and self.verbose:
+                logger.info("*** Example ***")
+                logger.info("guid: %s" % (example.guid))
+                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+                logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
+                logger.info("label: %s (id = %d)" % (example.label, label))
+
+            features.append(
+                    InputFeatures(input_ids=input_ids,
+                                  attention_mask=attention_mask,
+                                  label=label))
+
+        if return_tensors is None:
+            return features
+        elif return_tensors == 'tf':
+            if not is_tf_available():
+                raise ImportError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported")
+            import tensorflow as tf
+            def gen():
+                for ex in features:
+                    yield  ({'input_ids': ex.input_ids,
+                            'attention_mask': ex.attention_mask},
+                            ex.label)
+
+            dataset = tf.data.Dataset.from_generator(gen,
+                    ({'input_ids': tf.int32,
+                    'attention_mask': tf.int32},
+                    tf.int64),
+                    ({'input_ids': tf.TensorShape([None]),
+                    'attention_mask': tf.TensorShape([None])},
+                    tf.TensorShape([])))
+            return dataset
+        elif return_tensors == 'pt':
+            if not is_torch_available():
+                raise ImportError("return_tensors set to 'pt' but PyTorch can't be imported")
+            import torch
+            from torch.utils.data import TensorDataset
+            all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+            all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+            if self.mode == "classification":
+                all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
+            elif self.mode == "regression":
+                all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
+
+            dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
+            return dataset
+        else:
+            raise ValueError("return_tensors should be one of 'tf' or 'pt'")
--- a/transformers/data/processors/xnli.py
+++ b/transformers/data/processors/xnli.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" XNLI utils (dataset loading and evaluation) """
+
+from __future__ import absolute_import, division, print_function
+
+import logging
+import os
+
+from .utils import DataProcessor, InputExample
+
+logger = logging.getLogger(__name__)
+
+class XnliProcessor(DataProcessor):
+    """Processor for the XNLI dataset.
+    Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207"""
+
+    def __init__(self, language, train_language = None):
+        self.language = language
+        self.train_language = train_language
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        lg = self.language if self.train_language is None else self.train_language
+        lines = self._read_tsv(os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg)))
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % ('train', i)
+            text_a = line[0]
+            text_b = line[1]
+            label = "contradiction" if line[2] == "contradictory" else line[2]
+            assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv"))
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            language = line[0]
+            if language != self.language:
+                continue
+            guid = "%s-%s" % ('test', i)
+            text_a = line[6]
+            text_b = line[7]
+            label = line[1]
+            assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+    def get_labels(self):
+        """See base class."""
+        return ["contradiction", "entailment", "neutral"]
+
+xnli_processors = {
+    "xnli": XnliProcessor,
+}
+
+xnli_output_modes = {
+    "xnli": "classification",
+}
+
+xnli_tasks_num_labels = {
+    "xnli": 3,
+}
--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -10,10 +10,9 @@ import json
 import logging
 import os
 import six
-import shutil
 import tempfile
 import fnmatch
-from functools import wraps
+from functools import partial, wraps
 from hashlib import sha256
 from io import open

@@ -21,25 +20,38 @@ import boto3
 from botocore.config import Config
 from botocore.exceptions import ClientError
 import requests
-from tqdm import tqdm
+from tqdm.auto import tqdm
+from contextlib import contextmanager
+from . import __version__

-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+from filelock import FileLock

-try:
-    import tensorflow as tf
-    assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2
-    _tf_available = True  # pylint: disable=invalid-name
-    logger.info("TensorFlow version {} available.".format(tf.__version__))
-except (ImportError, AssertionError):
-    _tf_available = False  # pylint: disable=invalid-name
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

 try:
-    import torch
-    _torch_available = True  # pylint: disable=invalid-name
-    logger.info("PyTorch version {} available.".format(torch.__version__))
+    os.environ.setdefault('USE_TORCH', 'YES')
+    if os.environ['USE_TORCH'].upper() in ('1', 'ON', 'YES'):
+        import torch
+        _torch_available = True  # pylint: disable=invalid-name
+        logger.info("PyTorch version {} available.".format(torch.__version__))
+    else:
+        logger.info("USE_TORCH override through env variable, disabling PyTorch")
+        _torch_available = False
 except ImportError:
    _torch_available = False  # pylint: disable=invalid-name

+try:
+    os.environ.setdefault('USE_TF', 'YES')
+    if os.environ['USE_TF'].upper() in ('1', 'ON', 'YES'):
+        import tensorflow as tf
+        assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2
+        _tf_available = True  # pylint: disable=invalid-name
+        logger.info("TensorFlow version {} available.".format(tf.__version__))
+    else:
+        logger.info("USE_TF override through env variable, disabling Tensorflow")
+        _tf_available = False
+except (ImportError, AssertionError):
+    _tf_available = False  # pylint: disable=invalid-name

 try:
    from torch.hub import _get_torch_home
@@ -71,11 +83,20 @@ WEIGHTS_NAME = "pytorch_model.bin"
 TF2_WEIGHTS_NAME = 'tf_model.h5'
 TF_WEIGHTS_NAME = 'model.ckpt'
 CONFIG_NAME = "config.json"
+MODEL_CARD_NAME = "modelcard.json"
+
+DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
+DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
+
+S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
+CLOUDFRONT_DISTRIB_PREFIX = "https://d2ws9o8vfrpkyk.cloudfront.net"
+

 def is_torch_available():
    return _torch_available

 def is_tf_available():
+
    return _tf_available

 if not six.PY2:
@@ -102,12 +123,25 @@ else:
            return fn
        return docstring_decorator

+
+def is_remote_url(url_or_filename):
+    parsed = urlparse(url_or_filename)
+    return parsed.scheme in ('http', 'https', 's3')
+
+def hf_bucket_url(identifier, postfix=None, cdn=False):
+    endpoint = CLOUDFRONT_DISTRIB_PREFIX if cdn else S3_BUCKET_PREFIX
+    if postfix is None:
+        return "/".join((endpoint, identifier))
+    else:
+        return "/".join((endpoint, identifier, postfix))
+
+
 def url_to_filename(url, etag=None):
    """
    Convert `url` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the url's, delimited
    by a period.
-    If the url ends with .h5 (Keras HDF5 weights) ands '.h5' to the name
+    If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name
    so that TF 2.0 can identify it as a HDF5 file
    (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
    """
@@ -152,7 +186,7 @@ def filename_to_url(filename, cache_dir=None):
    return url, etag


-def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None):
+def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False, user_agent=None):
    """
    Given something that might be a URL (or might be a local path),
    determine which. If it's a URL, download the file and cache it, and
@@ -161,6 +195,8 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
    Args:
        cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
        force_download: if True, re-dowload the file even if it's already cached in the cache dir.
+        resume_download: if True, resume the download if incompletly recieved file is found.
+        user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
    """
    if cache_dir is None:
        cache_dir = TRANSFORMERS_CACHE
@@ -169,15 +205,15 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

-    parsed = urlparse(url_or_filename)
-
-    if parsed.scheme in ('http', 'https', 's3'):
+    if is_remote_url(url_or_filename):
        # URL, so get it from the cache (downloading if necessary)
-        return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
+        return get_from_cache(url_or_filename, cache_dir=cache_dir,
+            force_download=force_download, proxies=proxies,
+            resume_download=resume_download, user_agent=user_agent)
    elif os.path.exists(url_or_filename):
        # File, and it exists.
        return url_or_filename
-    elif parsed.scheme == '':
+    elif urlparse(url_or_filename).scheme == '':
        # File, but it doesn't exist.
        raise EnvironmentError("file {} not found".format(url_or_filename))
    else:
@@ -234,19 +270,34 @@ def s3_get(url, temp_file, proxies=None):
    s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)


-def http_get(url, temp_file, proxies=None):
-    req = requests.get(url, stream=True, proxies=proxies)
-    content_length = req.headers.get('Content-Length')
-    total = int(content_length) if content_length is not None else None
-    progress = tqdm(unit="B", total=total)
-    for chunk in req.iter_content(chunk_size=1024):
+def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None):
+    ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
+    if isinstance(user_agent, dict):
+        ua += "; " + "; ".join(
+            "{}/{}".format(k, v) for k, v in user_agent.items()
+        )
+    elif isinstance(user_agent, six.string_types):
+        ua += "; "+ user_agent
+    headers = {
+        "user-agent": ua
+    }
+    if resume_size > 0:
+        headers['Range'] = 'bytes=%d-' % (resume_size,)
+    response = requests.get(url, stream=True, proxies=proxies, headers=headers)
+    if response.status_code == 416:  # Range not satisfiable
+        return
+    content_length = response.headers.get('Content-Length')
+    total = resume_size + int(content_length) if content_length is not None else None
+    progress = tqdm(unit="B", unit_scale=True, total=total, initial=resume_size,
+                    desc="Downloading", disable=bool(logger.level<=logging.INFO))
+    for chunk in response.iter_content(chunk_size=1024):
        if chunk: # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()


-def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10):
+def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False, user_agent=None):
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
@@ -284,41 +335,60 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag
    # If we don't have a connection (etag is None) and can't identify the file
    # try to get the last downloaded one
    if not os.path.exists(cache_path) and etag is None:
-        matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*')
-        matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files))
+        matching_files = [
+            file
+            for file in fnmatch.filter(os.listdir(cache_dir), filename + '.*')
+            if not file.endswith('.json') and not file.endswith('.lock')
+        ]
        if matching_files:
            cache_path = os.path.join(cache_dir, matching_files[-1])

-    if not os.path.exists(cache_path) or force_download:
-        # Download to temporary file, then copy to cache dir once finished.
-        # Otherwise you get corrupt cache entries if the download gets interrupted.
-        with tempfile.NamedTemporaryFile() as temp_file:
-            logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
-
-            # GET file object
-            if url.startswith("s3://"):
-                s3_get(url, temp_file, proxies=proxies)
+    # Prevent parallel downloads of the same file with a lock.
+    lock_path = cache_path + '.lock'
+    with FileLock(lock_path):
+
+        if resume_download:
+            incomplete_path = cache_path + '.incomplete'
+            @contextmanager
+            def _resumable_file_manager():
+                with open(incomplete_path,'a+b') as f:
+                    yield f
+            temp_file_manager = _resumable_file_manager
+            if os.path.exists(incomplete_path):
+                resume_size = os.stat(incomplete_path).st_size
            else:
-                http_get(url, temp_file, proxies=proxies)
-
-            # we are copying the file before closing it, so flush to avoid truncation
-            temp_file.flush()
-            # shutil.copyfileobj() starts at the current position, so go to the start
-            temp_file.seek(0)
-
-            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
-            with open(cache_path, 'wb') as cache_file:
-                shutil.copyfileobj(temp_file, cache_file)
-
-            logger.info("creating metadata file for %s", cache_path)
-            meta = {'url': url, 'etag': etag}
-            meta_path = cache_path + '.json'
-            with open(meta_path, 'w') as meta_file:
-                output_string = json.dumps(meta)
-                if sys.version_info[0] == 2 and isinstance(output_string, str):
-                    output_string = unicode(output_string, 'utf-8')  # The beauty of python 2
-                meta_file.write(output_string)
-
-            logger.info("removing temp file %s", temp_file.name)
+                resume_size = 0
+        else:
+            temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False)
+            resume_size = 0
+
+        if etag is not None and (not os.path.exists(cache_path) or force_download):
+            # Download to temporary file, then copy to cache dir once finished.
+            # Otherwise you get corrupt cache entries if the download gets interrupted.
+            with temp_file_manager() as temp_file:
+                logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
+
+                # GET file object
+                if url.startswith("s3://"):
+                    if resume_download:
+                        logger.warn('Warning: resumable downloads are not implemented for "s3://" urls')
+                    s3_get(url, temp_file, proxies=proxies)
+                else:
+                    http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)
+
+                # we are copying the file before closing it, so flush to avoid truncation
+                temp_file.flush()
+
+                logger.info("storing %s in cache at %s", url, cache_path)
+                os.rename(temp_file.name, cache_path)
+
+                logger.info("creating metadata file for %s", cache_path)
+                meta = {'url': url, 'etag': etag}
+                meta_path = cache_path + '.json'
+                with open(meta_path, 'w') as meta_file:
+                    output_string = json.dumps(meta)
+                    if sys.version_info[0] == 2 and isinstance(output_string, str):
+                        output_string = unicode(output_string, 'utf-8')  # The beauty of python 2
+                    meta_file.write(output_string)

    return cache_path
--- a/transformers/hf_api.py
+++ b/transformers/hf_api.py
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function
+
+import os
+from os.path import expanduser
+
+import requests
+import six
+from requests.exceptions import HTTPError
+from tqdm import tqdm
+
+ENDPOINT = "https://huggingface.co"
+
+class S3Obj:
+    def __init__(
+        self,
+        filename,     # type: str
+        LastModified, # type: str
+        ETag,         # type: str
+        Size,         # type: int
+        **kwargs
+    ):
+        self.filename = filename
+        self.LastModified = LastModified
+        self.ETag = ETag
+        self.Size = Size
+
+
+class PresignedUrl:
+    def __init__(
+        self,
+        write,  # type: str
+        access, # type: str
+        type,   # type: str
+        **kwargs
+    ):
+        self.write = write
+        self.access = access
+        self.type = type # mime-type to send to S3.
+
+
+class HfApi:
+    def __init__(self, endpoint=None):
+        self.endpoint = endpoint if endpoint is not None else ENDPOINT
+
+    def login(
+        self,
+        username, # type: str
+        password, # type: str
+    ):
+        # type: (...) -> str
+        """
+        Call HF API to sign in a user and get a token if credentials are valid.
+
+        Outputs:
+            token if credentials are valid
+
+        Throws:
+            requests.exceptions.HTTPError if credentials are invalid
+        """
+        path = "{}/api/login".format(self.endpoint)
+        r = requests.post(path, json={"username": username, "password": password})
+        r.raise_for_status()
+        d = r.json()
+        return d["token"]
+
+    def whoami(
+        self,
+        token, # type: str
+    ):
+        # type: (...) -> str
+        """
+        Call HF API to know "whoami"
+        """
+        path = "{}/api/whoami".format(self.endpoint)
+        r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
+        r.raise_for_status()
+        d = r.json()
+        return d["user"]
+
+    def logout(self, token):
+        # type: (...) -> void
+        """
+        Call HF API to log out.
+        """
+        path = "{}/api/logout".format(self.endpoint)
+        r = requests.post(path, headers={"authorization": "Bearer {}".format(token)})
+        r.raise_for_status()
+
+    def presign(self, token, filename):
+        # type: (...) -> PresignedUrl
+        """
+        Call HF API to get a presigned url to upload `filename` to S3.
+        """
+        path = "{}/api/presign".format(self.endpoint)
+        r = requests.post(
+            path,
+            headers={"authorization": "Bearer {}".format(token)},
+            json={"filename": filename},
+        )
+        r.raise_for_status()
+        d = r.json()
+        return PresignedUrl(**d)
+
+    def presign_and_upload(self, token, filename, filepath):
+        # type: (...) -> str
+        """
+        Get a presigned url, then upload file to S3.
+
+        Outputs:
+            url: Read-only url for the stored file on S3.
+        """
+        urls = self.presign(token, filename=filename)
+        # streaming upload:
+        # https://2.python-requests.org/en/master/user/advanced/#streaming-uploads
+        # 
+        # Even though we presign with the correct content-type,
+        # the client still has to specify it when uploading the file.
+        with open(filepath, "rb") as f:
+            pf = TqdmProgressFileReader(f)
+            data = f if pf.total_size > 0 else ""
+
+            r = requests.put(urls.write, data=data, headers={
+                "content-type": urls.type,
+            })
+            r.raise_for_status()
+            pf.close()
+        return urls.access
+
+    def list_objs(self, token):
+        # type: (...) -> List[S3Obj]
+        """
+        Call HF API to list all stored files for user.
+        """
+        path = "{}/api/listObjs".format(self.endpoint)
+        r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
+        r.raise_for_status()
+        d = r.json()
+        return [S3Obj(**x) for x in d]
+
+
+
+class TqdmProgressFileReader:
+    """
+    Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`)
+    and override `f.read()` so as to display a tqdm progress bar.
+
+    see github.com/huggingface/transformers/pull/2078#discussion_r354739608
+    for implementation details.
+    """
+    def __init__(
+        self,
+        f   # type: io.BufferedReader
+    ):
+        self.f = f
+        self.total_size = os.fstat(f.fileno()).st_size # type: int
+        self.pbar = tqdm(total=self.total_size, leave=False)
+        if six.PY3:
+            # does not work unless PY3
+            # no big deal as the CLI does not currently support PY2 anyways.
+            self.read = f.read
+            f.read = self._read
+
+    def _read(self, n=-1):
+        self.pbar.update(n)
+        return self.read(n)
+
+    def close(self):
+        self.pbar.close()
+
+
+
+class HfFolder:
+    path_token = expanduser("~/.huggingface/token")
+
+    @classmethod
+    def save_token(cls, token):
+        """
+        Save token, creating folder as needed.
+        """
+        if six.PY3:
+            os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
+        else:
+            # Python 2
+            try:
+                os.makedirs(os.path.dirname(cls.path_token))
+            except OSError as e:
+                if e.errno != os.errno.EEXIST:
+                    raise e
+                pass
+        with open(cls.path_token, 'w+') as f:
+            f.write(token)
+
+    @classmethod
+    def get_token(cls):
+        """
+        Get token or None if not existent.
+        """
+        try:
+            with open(cls.path_token, 'r') as f:
+                return f.read()
+        except:
+            # this is too wide. When Py2 is dead use:
+            # `except FileNotFoundError:` instead
+            return None
+
+    @classmethod
+    def delete_token(cls):
+        """
+        Delete token.
+        Do not fail if token does not exist.
+        """
+        try:
+            os.remove(cls.path_token)
+        except:
+            return
--- a/transformers/modelcard.py
+++ b/transformers/modelcard.py
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Configuration base class and utilities."""
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import copy
+import json
+import logging
+import os
+from io import open
+
+from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, WEIGHTS_NAME, TF2_WEIGHTS_NAME, \
+                        cached_path, is_remote_url, hf_bucket_url
+
+
+logger = logging.getLogger(__name__)
+
+
+class ModelCard(object):
+    r""" Model Card class.
+        Store model card as well as methods for loading/downloading/saving model cards.
+
+        Please read the following paper for details and explanation on the sections:
+            "Model Cards for Model Reporting"
+                by Margaret Mitchell, Simone Wu,
+                Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
+                Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
+            Link: https://arxiv.org/abs/1810.03993
+
+        Note:
+            A model card can be loaded and saved to disk.
+
+        Parameters:
+    """
+    def __init__(self, **kwargs):
+        # Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers)
+        self.model_details = kwargs.pop('model_details', {})
+        self.intended_use = kwargs.pop('intended_use', {})
+        self.factors = kwargs.pop('factors', {})
+        self.metrics = kwargs.pop('metrics', {})
+        self.evaluation_data = kwargs.pop('evaluation_data', {})
+        self.training_data = kwargs.pop('training_data', {})
+        self.quantitative_analyses = kwargs.pop('quantitative_analyses', {})
+        self.ethical_considerations = kwargs.pop('ethical_considerations', {})
+        self.caveats_and_recommendations = kwargs.pop('caveats_and_recommendations', {})
+
+        # Open additional attributes
+        for key, value in kwargs.items():
+            try:
+                setattr(self, key, value)
+            except AttributeError as err:
+                logger.error("Can't set {} with value {} for {}".format(key, value, self))
+                raise err
+
+    def save_pretrained(self, save_directory_or_file):
+        """ Save a model card object to the directory or file `save_directory_or_file`.
+        """
+        if os.path.isdir(save_directory_or_file):
+            # If we save using the predefined names, we can load using `from_pretrained`
+            output_model_card_file = os.path.join(save_directory_or_file, MODEL_CARD_NAME)
+        else:
+            output_model_card_file = save_directory_or_file
+
+        self.to_json_file(output_model_card_file)
+        logger.info("Model card saved in {}".format(output_model_card_file))
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r""" Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
+
+        Parameters:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
+                - a path to a `directory` containing a mode card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/modelcard.json``.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                card should be cached if the standard cache should not be used.
+
+            kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading.
+
+                - The values in kwargs of any keys which are model card attributes will be used to override the loaded values.
+                - Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the `return_unused_kwargs` keyword parameter.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            find_from_standard_name: (`optional`) boolean, default True:
+                If the pretrained_model_name_or_path ends with our standard model or config filenames, replace them with our standard modelcard filename.
+                Can be used to directly feed a model/config url and access the colocated modelcard.
+
+            return_unused_kwargs: (`optional`) bool:
+
+                - If False, then this function returns just the final model card object.
+                - If True, then this functions returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not model card attributes: ie the part of kwargs which has not been used to update `ModelCard` and is otherwise ignored.
+
+        Examples::
+
+            modelcard = ModelCard.from_pretrained('bert-base-uncased')    # Download model card from S3 and cache.
+            modelcard = ModelCard.from_pretrained('./test/saved_model/')  # E.g. model card was saved using `save_pretrained('./test/saved_model/')`
+            modelcard = ModelCard.from_pretrained('./test/saved_model/modelcard.json')
+            modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
+
+        """
+        cache_dir = kwargs.pop('cache_dir', None)
+        proxies = kwargs.pop('proxies', None)
+        find_from_standard_name = kwargs.pop('find_from_standard_name', True)
+        return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
+
+        if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
+            # For simplicity we use the same pretrained url than the configuration files
+            # but with a different suffix (modelcard.json). This suffix is replaced below.
+            model_card_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
+        elif os.path.isdir(pretrained_model_name_or_path):
+            model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME)
+        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
+            model_card_file = pretrained_model_name_or_path
+        else:
+            model_card_file = hf_bucket_url(pretrained_model_name_or_path, postfix=MODEL_CARD_NAME)
+
+        if find_from_standard_name or pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
+            model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME)
+            model_card_file = model_card_file.replace(WEIGHTS_NAME, MODEL_CARD_NAME)
+            model_card_file = model_card_file.replace(TF2_WEIGHTS_NAME, MODEL_CARD_NAME)
+
+        try:
+            # Load from URL or cache if already cached
+            resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=True,
+                                               proxies=proxies, resume_download=False)
+            if resolved_model_card_file == model_card_file:
+                logger.info("loading model card file {}".format(model_card_file))
+            else:
+                logger.info("loading model card file {} from cache at {}".format(
+                    model_card_file, resolved_model_card_file))
+            # Load model card
+            modelcard = cls.from_json_file(resolved_model_card_file)
+
+        except EnvironmentError:
+            if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
+                logger.warning("Couldn't reach server at '{}' to download model card file.".format(
+                        model_card_file))
+            else:
+                logger.warning("Model name '{}' was not found in model name list ({}). " \
+                      "We assumed '{}' was a path or url to a model card file named {} or " \
+                      "a directory containing such a file but couldn't find any such file at this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
+                        model_card_file, MODEL_CARD_NAME))
+            logger.warning("Creating an empty model card.")
+
+            # We fall back on creating an empty model card
+            modelcard = cls()
+
+        except json.JSONDecodeError:
+            logger.warning("Couldn't reach server at '{}' to download model card file or "
+                           "model card file is not a valid JSON file. "
+                           "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file))
+            logger.warning("Creating an empty model card.")
+
+            # We fall back on creating an empty model card
+            modelcard = cls()
+
+        # Update model card with kwargs if needed
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(modelcard, key):
+                setattr(modelcard, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+
+        logger.info("Model card: %s", str(modelcard))
+        if return_unused_kwargs:
+            return modelcard, kwargs
+        else:
+            return modelcard
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `ModelCard` from a Python dictionary of parameters."""
+        return cls(**json_object)
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `ModelCard` from a json file of parameters."""
+        with open(json_file, "r", encoding='utf-8') as reader:
+            text = reader.read()
+        dict_obj = json.loads(text)
+        return cls(**dict_obj)
+
+    def __eq__(self, other):
+        return self.__dict__ == other.__dict__
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+    def to_json_file(self, json_file_path):
+        """ Save this instance to a json file."""
+        with open(json_file_path, "w", encoding='utf-8') as writer:
+            writer.write(self.to_json_string())
--- a/transformers/modeling_albert.py
+++ b/transformers/modeling_albert.py
+
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch ALBERT model. """
+
+import os
+import math
+import logging
+import torch
+import torch.nn as nn
+from torch.nn import CrossEntropyLoss, MSELoss
+from transformers.modeling_utils import PreTrainedModel
+from transformers.configuration_albert import AlbertConfig
+from transformers.modeling_bert import BertEmbeddings, BertSelfAttention, prune_linear_layer, ACT2FN
+from .file_utils import add_start_docstrings
+
+logger = logging.getLogger(__name__)
+
+
+ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin",
+    'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin",
+    'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin",
+    'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin",
+    'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin",
+    'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin",
+    'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin",
+    'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin",
+}
+
+
+def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
+    """ Load tf checkpoints in a pytorch model."""
+    try:
+        import re
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions.")
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        print(name)
+    
+    for name, array in zip(names, arrays):
+        original_name = name
+
+        # If saved from the TF HUB module
+        name = name.replace("module/", "")
+
+        # Renaming and simplifying
+        name = name.replace("ffn_1", "ffn")
+        name = name.replace("bert/", "albert/")
+        name = name.replace("attention_1", "attention")   
+        name = name.replace("transform/", "")
+        name = name.replace("LayerNorm_1", "full_layer_layer_norm")    
+        name = name.replace("LayerNorm", "attention/LayerNorm")   
+        name = name.replace("transformer/", "")
+
+        # The feed forward layer had an 'intermediate' step which has been abstracted away
+        name = name.replace("intermediate/dense/", "")
+        name = name.replace("ffn/intermediate/output/dense/", "ffn_output/")
+
+        # ALBERT attention was split between self and output which have been abstracted away
+        name = name.replace("/output/", "/")
+        name = name.replace("/self/", "/")
+
+        # The pooler is a linear layer
+        name = name.replace("pooler/dense", "pooler")
+
+        # The classifier was simplified to predictions from cls/predictions
+        name = name.replace("cls/predictions", "predictions")
+        name = name.replace("predictions/attention", "predictions")
+
+        # Naming was changed to be more explicit
+        name = name.replace("embeddings/attention", "embeddings")    
+        name = name.replace("inner_group_", "albert_layers/") 
+        name = name.replace("group_", "albert_layer_groups/")   
+
+        # Classifier
+        if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
+            name = "classifier/" + name
+
+        # No ALBERT model currently handles the next sentence prediction task 
+        if "seq_relationship" in name:
+            continue
+
+        name = name.split('/')
+
+        # Ignore the gradients applied by the LAMB/ADAM optimizers.
+        if "adam_m" in name or "adam_v" in name or "global_step" in name:
+            logger.info("Skipping {}".format("/".join(name)))
+            continue
+
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                l = re.split(r'_(\d+)', m_name)
+            else:
+                l = [m_name]
+
+            if l[0] == 'kernel' or l[0] == 'gamma':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'output_bias' or l[0] == 'beta':
+                pointer = getattr(pointer, 'bias')
+            elif l[0] == 'output_weights':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'squad':
+                pointer = getattr(pointer, 'classifier')
+            else:
+                try:
+                    pointer = getattr(pointer, l[0])
+                except AttributeError:
+                    logger.info("Skipping {}".format("/".join(name)))
+                    continue
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+
+        if m_name[-11:] == '_embeddings':
+            pointer = getattr(pointer, 'weight')
+        elif m_name == 'kernel':
+            array = np.transpose(array)
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        print("Initialize PyTorch weight {} from {}".format(name, original_name))
+        pointer.data = torch.from_numpy(array)
+
+    return model
+
+
+class AlbertEmbeddings(BertEmbeddings):
+    """
+    Construct the embeddings from word, position and token_type embeddings.
+    """
+    def __init__(self, config):
+        super(AlbertEmbeddings, self).__init__(config)
+
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
+        self.LayerNorm = torch.nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
+
+
+class AlbertAttention(BertSelfAttention):
+    def __init__(self, config):
+        super(AlbertAttention, self).__init__(config)
+
+        self.output_attentions = config.output_attentions
+        self.num_attention_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size 
+        self.attention_head_size = config.hidden_size // config.num_attention_heads
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        mask = torch.ones(self.num_attention_heads, self.attention_head_size)
+        heads = set(heads) - self.pruned_heads  # Convert to set and emove already pruned heads
+        for head in heads:
+            # Compute how many pruned heads are before the head and move the index accordingly
+            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
+            mask[head] = 0
+        mask = mask.view(-1).contiguous().eq(1)
+        index = torch.arange(len(mask))[mask].long()
+
+        # Prune linear layers
+        self.query = prune_linear_layer(self.query, index)
+        self.key = prune_linear_layer(self.key, index)
+        self.value = prune_linear_layer(self.value, index)
+        self.dense = prune_linear_layer(self.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.num_attention_heads = self.num_attention_heads - len(heads)
+        self.all_head_size = self.attention_head_size * self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(self, input_ids, attention_mask=None, head_mask=None):
+        mixed_query_layer = self.query(input_ids)
+        mixed_key_layer = self.key(input_ids)
+        mixed_value_layer = self.value(input_ids)
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+        key_layer = self.transpose_for_scores(mixed_key_layer)
+        value_layer = self.transpose_for_scores(mixed_value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        reshaped_context_layer = context_layer.view(*new_context_layer_shape)
+        
+
+        # Should find a better way to do this
+        w = self.dense.weight.t().view(self.num_attention_heads, self.attention_head_size, self.hidden_size).to(context_layer.dtype)
+        b = self.dense.bias.to(context_layer.dtype)
+
+        projected_context_layer = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b
+        projected_context_layer_dropout = self.dropout(projected_context_layer)
+        layernormed_context_layer = self.LayerNorm(input_ids + projected_context_layer_dropout)
+        return (layernormed_context_layer, attention_probs) if self.output_attentions else (layernormed_context_layer,)
+
+
+class AlbertLayer(nn.Module):
+    def __init__(self, config):
+        super(AlbertLayer, self).__init__()
+        
+        self.config = config
+        self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attention = AlbertAttention(config)
+        self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) 
+        self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.activation = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
+        attention_output = self.attention(hidden_states, attention_mask, head_mask)
+        ffn_output = self.ffn(attention_output[0])
+        ffn_output = self.activation(ffn_output)
+        ffn_output = self.ffn_output(ffn_output)
+        hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0])
+
+        return (hidden_states,) + attention_output[1:]  # add attentions if we output them
+
+
+class AlbertLayerGroup(nn.Module):
+    def __init__(self, config):
+        super(AlbertLayerGroup, self).__init__()
+        
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])
+
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
+        layer_hidden_states = ()
+        layer_attentions = ()
+
+        for layer_index, albert_layer in enumerate(self.albert_layers):
+            layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index])
+            hidden_states = layer_output[0]
+
+            if self.output_attentions:
+                layer_attentions = layer_attentions + (layer_output[1],)
+
+            if self.output_hidden_states:
+                layer_hidden_states = layer_hidden_states + (hidden_states,)
+
+        outputs = (hidden_states,)
+        if self.output_hidden_states:
+            outputs = outputs + (layer_hidden_states,)
+        if self.output_attentions:
+            outputs = outputs + (layer_attentions,)
+        return outputs  # last-layer hidden state, (layer hidden states), (layer attentions)
+
+
+class AlbertTransformer(nn.Module):
+    def __init__(self, config):
+        super(AlbertTransformer, self).__init__()
+        
+        self.config = config
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
+        self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])
+
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
+        hidden_states = self.embedding_hidden_mapping_in(hidden_states)
+
+        all_attentions = ()
+
+        if self.output_hidden_states:
+            all_hidden_states = (hidden_states,)
+
+        for i in range(self.config.num_hidden_layers):
+            # Number of layers in a hidden group
+            layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)
+
+            # Index of the hidden group
+            group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
+
+            # Index of the layer inside the group
+            layer_idx = int(i - group_idx * layers_per_group)
+            
+            layer_group_output = self.albert_layer_groups[group_idx](hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group])  
+            hidden_states = layer_group_output[0]
+
+            if self.output_attentions:
+                all_attentions = all_attentions + layer_group_output[-1]
+
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+        
+        outputs = (hidden_states,)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            outputs = outputs + (all_attentions,)
+        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
+
+
+
+class AlbertPreTrainedModel(PreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for dowloading and loading pretrained models.
+    """
+    config_class = AlbertConfig
+    pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "albert"
+
+    def _init_weights(self, module):
+        """ Initialize the weights.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, (nn.Linear)) and module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+ALBERT_START_DOCSTRING = r"""    The ALBERT model was proposed in
+    `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`_
+    by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
+    two parameter-reduction techniques to lower memory consumption and increase the trainig speed of BERT.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`:
+        https://arxiv.org/abs/1909.11942
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model. 
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+ALBERT_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
+                
+                ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
+
+            (b) For single sequences:
+
+                ``tokens:         [CLS] the dog is hairy . [SEP]``
+                
+                ``token_type_ids:   0   0   0   0  0     0   0``
+
+            Albert is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+
+            Indices can be obtained using :class:`transformers.AlbertTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Segment token indices to indicate first and second portions of the inputs.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
+                      ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
+class AlbertModel(AlbertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
+            Last layer hidden-state of the first token of the sequence (classification token)
+            further processed by a Linear layer and a Tanh activation function. The Linear
+            layer weights are trained from the next sentence prediction (classification)
+            objective during Bert pretraining. This output is usually *not* a good summary
+            of the semantic content of the input, you're often better with averaging or pooling
+            the sequence of hidden-states for the whole input sequence.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    """
+
+    config_class = AlbertConfig
+    pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_albert
+    base_model_prefix = "albert"
+
+    def __init__(self, config):
+        super(AlbertModel, self).__init__(config)
+
+        self.config = config
+        self.embeddings = AlbertEmbeddings(config)
+        self.encoder = AlbertTransformer(config)
+        self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
+        self.pooler_activation = nn.Tanh()
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _resize_token_embeddings(self, new_num_tokens):
+        old_embeddings = self.embeddings.word_embeddings
+        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
+        self.embeddings.word_embeddings = new_embeddings
+        return self.embeddings.word_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+            ALBERT has a different architecture in that its layers are shared across groups, which then has inner groups.
+            If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there
+            is a total of 4 different layers.
+
+            These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
+            while [2,3] correspond to the two inner groups of the second hidden layer.
+
+            Any layer with in index other than [0,1,2,3] will result in an error.
+            See base class PreTrainedModel for more information about head pruning
+        """
+        for layer, heads in heads_to_prune.items():
+            group_idx = int(layer / self.config.inner_group_num)
+            inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
+            self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
+
+    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                inputs_embeds=None):
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if attention_mask is None:
+            attention_mask = torch.ones(input_shape, device=device)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                                           inputs_embeds=inputs_embeds)
+        encoder_outputs = self.encoder(embedding_output,
+                                       extended_attention_mask,
+                                       head_mask=head_mask)
+
+        sequence_output = encoder_outputs[0]
+
+        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))
+
+        outputs = (sequence_output, pooled_output) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+        return outputs
+
+class AlbertMLMHead(nn.Module):
+    def __init__(self, config):
+        super(AlbertMLMHead, self).__init__()
+
+        self.LayerNorm = nn.LayerNorm(config.embedding_size)
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
+        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
+        self.activation = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+
+        prediction_scores = hidden_states + self.bias
+
+        return prediction_scores
+
+
+@add_start_docstrings("Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
+class AlbertForMaskedLM(AlbertPreTrainedModel):
+    r"""
+        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Masked language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    """
+
+    def __init__(self, config):
+        super(AlbertForMaskedLM, self).__init__(config)
+
+        self.albert = AlbertModel(config)
+        self.predictions = AlbertMLMHead(config)
+
+        self.init_weights()
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.predictions.decoder,
+                                   self.albert.embeddings.word_embeddings)
+
+    def get_output_embeddings(self):
+        return self.predictions.decoder
+
+    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
+                masked_lm_labels=None):
+        outputs = self.albert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds
+        )
+        sequence_outputs = outputs[0]
+
+        prediction_scores = self.predictions(sequence_outputs)
+
+        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
+        if masked_lm_labels is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
+            outputs = (masked_lm_loss,) + outputs
+
+        return outputs
+
+
+@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+    ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
+class AlbertForSequenceClassification(AlbertPreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if config.num_labels==1) loss.
+        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+        model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(AlbertForSequenceClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.albert = AlbertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
+
+        outputs = self.albert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        if labels is not None:
+            if self.num_labels == 1:
+                #  We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), logits, (hidden_states), (attentions)
+
+
+
+@add_start_docstrings("""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+    ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
+class AlbertForQuestionAnswering(AlbertPreTrainedModel):
+    r"""
+        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Position outside of the sequence are not taken into account for computing the loss.
+        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Position outside of the sequence are not taken into account for computing the loss.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+        model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')
+        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+        input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
+        input_ids = tokenizer.encode(input_text)
+        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] 
+        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
+        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)  
+        print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
+        # a nice puppet
+
+
+    """
+    def __init__(self, config):
+        super(AlbertForQuestionAnswering, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.albert = AlbertModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                inputs_embeds=None, start_positions=None, end_positions=None):
+
+        outputs = self.albert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+
+        outputs = (start_logits, end_logits,) + outputs[2:]
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions.clamp_(0, ignored_index)
+            end_positions.clamp_(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+            outputs = (total_loss,) + outputs
+
+        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
--- a/transformers/modeling_auto.py
+++ b/transformers/modeling_auto.py
@@ -18,15 +18,31 @@ from __future__ import absolute_import, division, print_function, unicode_litera

 import logging

-from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
-from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel
-from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel
-from .modeling_ctrl import CTRLModel, CTRLLMHeadModel
-from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel
-from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering
-from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
-from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
-from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
+from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRLConfig,
+                                 DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig,
+                                 TransfoXLConfig, XLMConfig, XLNetConfig, XLMRobertaConfig)
+
+from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, \
+    BertForTokenClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_ctrl import CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, \
+    XLNetForTokenClassification, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering, \
+    XLM_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, \
+    RobertaForTokenClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, \
+    DistilBertForSequenceClassification, DistilBertForTokenClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, \
+    CamembertForMultipleChoice, CamembertForTokenClassification, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, \
+    AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, \
+    XLMRobertaForMultipleChoice, XLMRobertaForTokenClassification, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP

 from .modeling_utils import PreTrainedModel, SequenceSummary

@@ -35,33 +51,105 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)


+ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
+    for pretrained_map in [
+        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+        CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ]
+    for key, value, in pretrained_map.items())
+
+
 class AutoModel(object):
    r"""
        :class:`~transformers.AutoModel` is a generic model class
        that will be instantiated as one of the base model classes of the library
        when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+        or the `AutoModel.from_config(config)` class methods.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The base model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Model (T5 model)
            - contains `distilbert`: DistilBertModel (DistilBERT model)
+            - contains `albert`: AlbertModel (ALBERT model)
+            - contains `camembert`: CamembertModel (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaModel (XLM-RoBERTa model)
            - contains `roberta`: RobertaModel (RoBERTa model)
            - contains `bert`: BertModel (Bert model)
            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
-            - contains `ctrl`: CTRLModel (Salesforce CTRL  model)
            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
            - contains `xlnet`: XLNetModel (XLNet model)
            - contains `xlm`: XLMModel (XLM model)
+            - contains `ctrl`: CTRLModel (Salesforce CTRL  model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        raise EnvironmentError("AutoModel is designed to be instantiated "
-            "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
+            "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` or "
+            "`AutoModel.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
+                    - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model)
+                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `openai-gpt` configuration class: OpenAIGPTModel (OpenAI GPT model)
+                    - isInstance of `gpt2` configuration class: GPT2Model (OpenAI GPT-2 model)
+                    - isInstance of `ctrl` configuration class: CTRLModel (Salesforce CTRL  model)
+                    - isInstance of `transfo-xl` configuration class: TransfoXLModel (Transformer-XL model)
+                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = AutoModel.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, DistilBertConfig):
+            return DistilBertModel(config)
+        elif isinstance(config, RobertaConfig):
+            return RobertaModel(config)
+        elif isinstance(config, BertConfig):
+            return BertModel(config)
+        elif isinstance(config, OpenAIGPTConfig):
+            return OpenAIGPTModel(config)
+        elif isinstance(config, GPT2Config):
+            return GPT2Model(config)
+        elif isinstance(config, TransfoXLConfig):
+            return TransfoXLModel(config)
+        elif isinstance(config, XLNetConfig):
+            return XLNetModel(config)
+        elif isinstance(config, XLMConfig):
+            return XLMModel(config)
+        elif isinstance(config, CTRLConfig):
+            return CTRLModel(config)
+        elif isinstance(config, AlbertConfig):
+            return AlbertModel(config)
+        elif isinstance(config, CamembertConfig):
+            return CamembertModel(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaModel(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -70,15 +158,19 @@ class AutoModel(object):

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Model (T5 model)
            - contains `distilbert`: DistilBertModel (DistilBERT model)
+            - contains `albert`: AlbertModel (ALBERT model)
+            - contains `camembert`: CamembertModel (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaModel (XLM-RoBERTa model)
            - contains `roberta`: RobertaModel (RoBERTa model)
            - contains `bert`: BertModel (Bert model)
            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
-            - contains `ctrl`: CTRLModel (Salesforce CTRL  model)
            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
            - contains `xlnet`: XLNetModel (XLNet model)
            - contains `xlm`: XLMModel (XLM model)
+            - contains `ctrl`: CTRLModel (Salesforce CTRL model)

            The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
            To train the model, you should first set it back in training mode with `model.train()`
@@ -87,6 +179,7 @@ class AutoModel(object):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

@@ -112,6 +205,9 @@ class AutoModel(object):
            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.

+            resume_download: (`optional`) boolean, default False:
+                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
+
            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.
@@ -136,8 +232,16 @@ class AutoModel(object):
            model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
            return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'camembert' in pretrained_model_name_or_path:
+            return CamembertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
@@ -156,7 +260,7 @@ class AutoModel(object):
            return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta, 'ctrl'".format(pretrained_model_name_or_path))
+                         "'xlm-roberta', 'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))


 class AutoModelWithLMHead(object):
@@ -171,21 +275,70 @@ class AutoModelWithLMHead(object):

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5ModelWithLMHead (T5 model)
            - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
+            - contains `albert`: AlbertForMaskedLM (ALBERT model)
+            - contains `camembert`: CamembertForMaskedLM (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaForMaskedLM (XLM-RoBERTa model)
            - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
            - contains `bert`: BertForMaskedLM (Bert model)
            - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
            - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
-            - contains `ctrl`: CTRLLMModel (Salesforce CTRL model)
            - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
            - contains `xlnet`: XLNetLMHeadModel (XLNet model)
            - contains `xlm`: XLMWithLMHeadModel (XLM model)
+            - contains `ctrl`: CTRLLMHeadModel (Salesforce CTRL model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
-            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or "
+            "`AutoModelWithLMHead.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
+                    - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model)
+                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `openai-gpt` configuration class: OpenAIGPTModel (OpenAI GPT model)
+                    - isInstance of `gpt2` configuration class: GPT2Model (OpenAI GPT-2 model)
+                    - isInstance of `ctrl` configuration class: CTRLModel (Salesforce CTRL  model)
+                    - isInstance of `transfo-xl` configuration class: TransfoXLModel (Transformer-XL model)
+                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = AutoModelWithLMHead.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, DistilBertConfig):
+            return DistilBertForMaskedLM(config)
+        elif isinstance(config, RobertaConfig):
+            return RobertaForMaskedLM(config)
+        elif isinstance(config, BertConfig):
+            return BertForMaskedLM(config)
+        elif isinstance(config, OpenAIGPTConfig):
+            return OpenAIGPTLMHeadModel(config)
+        elif isinstance(config, GPT2Config):
+            return GPT2LMHeadModel(config)
+        elif isinstance(config, TransfoXLConfig):
+            return TransfoXLLMHeadModel(config)
+        elif isinstance(config, XLNetConfig):
+            return XLNetLMHeadModel(config)
+        elif isinstance(config, XLMConfig):
+            return XLMWithLMHeadModel(config)
+        elif isinstance(config, CTRLConfig):
+            return CTRLLMHeadModel(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaForMaskedLM(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -197,7 +350,11 @@ class AutoModelWithLMHead(object):

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5ModelWithLMHead (T5 model)
            - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
+            - contains `albert`: AlbertForMaskedLM (ALBERT model)
+            - contains `camembert`: CamembertForMaskedLM (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaForMaskedLM (XLM-RoBERTa model)
            - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
            - contains `bert`: BertForMaskedLM (Bert model)
            - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
@@ -205,6 +362,7 @@ class AutoModelWithLMHead(object):
            - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
            - contains `xlnet`: XLNetLMHeadModel (XLNet model)
            - contains `xlm`: XLMWithLMHeadModel (XLM model)
+            - contains `ctrl`: CTRLLMHeadModel (Salesforce CTRL model)

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`
@@ -213,6 +371,7 @@ class AutoModelWithLMHead(object):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

@@ -237,6 +396,8 @@ class AutoModelWithLMHead(object):

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+            resume_download: (`optional`) boolean, default False:
+                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
@@ -262,8 +423,16 @@ class AutoModelWithLMHead(object):
            model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
            return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'camembert' in pretrained_model_name_or_path:
+            return CamembertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
@@ -282,7 +451,7 @@ class AutoModelWithLMHead(object):
            return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta','ctrl'".format(pretrained_model_name_or_path))
+                         "'xlm-roberta', 'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))


 class AutoModelForSequenceClassification(object):
@@ -298,6 +467,9 @@ class AutoModelForSequenceClassification(object):
        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
+            - contains `albert`: AlbertForSequenceClassification (ALBERT model)
+            - contains `camembert`: CamembertForSequenceClassification (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaForSequenceClassification (XLM-RoBERTa model)
            - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
            - contains `bert`: BertForSequenceClassification (Bert model)
            - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
@@ -306,8 +478,45 @@ class AutoModelForSequenceClassification(object):
        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
-        raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
-            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+        raise EnvironmentError("AutoModelForSequenceClassification is designed to be instantiated "
+            "using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or "
+            "`AutoModelForSequenceClassification.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
+                    - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model)
+                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = AutoModelForSequenceClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, AlbertConfig):
+            return AlbertForSequenceClassification(config)
+        elif isinstance(config, CamembertConfig):
+            return CamembertForSequenceClassification(config)
+        elif isinstance(config, DistilBertConfig):
+            return DistilBertForSequenceClassification(config)
+        elif isinstance(config, RobertaConfig):
+            return RobertaForSequenceClassification(config)
+        elif isinstance(config, BertConfig):
+            return BertForSequenceClassification(config)
+        elif isinstance(config, XLNetConfig):
+            return XLNetForSequenceClassification(config)
+        elif isinstance(config, XLMConfig):
+            return XLMForSequenceClassification(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaForSequenceClassification(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -320,6 +529,9 @@ class AutoModelForSequenceClassification(object):
        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
+            - contains `albert`: AlbertForSequenceClassification (ALBERT model)
+            - contains `camembert`: CamembertForSequenceClassification (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaForSequenceClassification (XLM-RoBERTa model)
            - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
            - contains `bert`: BertForSequenceClassification (Bert model)
            - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
@@ -332,6 +544,7 @@ class AutoModelForSequenceClassification(object):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

@@ -357,6 +570,9 @@ class AutoModelForSequenceClassification(object):
            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.

+            resume_download: (`optional`) boolean, default False:
+                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
+
            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.
@@ -383,6 +599,12 @@ class AutoModelForSequenceClassification(object):
        """
        if 'distilbert' in pretrained_model_name_or_path:
            return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return AlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'camembert' in pretrained_model_name_or_path:
+            return CamembertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
@@ -393,7 +615,7 @@ class AutoModelForSequenceClassification(object):
            return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
+                         "'bert', 'xlnet', 'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))


 class AutoModelForQuestionAnswering(object):
@@ -409,6 +631,7 @@ class AutoModelForQuestionAnswering(object):
        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
+            - contains `albert`: AlbertForQuestionAnswering (ALBERT model)
            - contains `bert`: BertForQuestionAnswering (Bert model)
            - contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
            - contains `xlm`: XLMForQuestionAnswering (XLM model)
@@ -416,8 +639,38 @@ class AutoModelForQuestionAnswering(object):
        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
-        raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
-            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+        raise EnvironmentError("AutoModelForQuestionAnswering is designed to be instantiated "
+            "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or "
+            "`AutoModelForQuestionAnswering.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
+                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = AutoModelForSequenceClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, AlbertConfig):
+            return AlbertForQuestionAnswering(config)
+        elif isinstance(config, DistilBertConfig):
+            return DistilBertForQuestionAnswering(config)
+        elif isinstance(config, BertConfig):
+            return BertForQuestionAnswering(config)
+        elif isinstance(config, XLNetConfig):
+            return XLNetForQuestionAnswering(config)
+        elif isinstance(config, XLMConfig):
+            return XLMForQuestionAnswering(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -430,6 +683,7 @@ class AutoModelForQuestionAnswering(object):
        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
+            - contains `albert`: AlbertForQuestionAnswering (ALBERT model)
            - contains `bert`: BertForQuestionAnswering (Bert model)
            - contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
            - contains `xlm`: XLMForQuestionAnswering (XLM model)
@@ -441,6 +695,7 @@ class AutoModelForQuestionAnswering(object):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

@@ -492,6 +747,8 @@ class AutoModelForQuestionAnswering(object):
        """
        if 'distilbert' in pretrained_model_name_or_path:
            return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return AlbertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
@@ -500,4 +757,131 @@ class AutoModelForQuestionAnswering(object):
            return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
+                         "'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path))
+
+
+class AutoModelForTokenClassification:
+    def __init__(self):
+        raise EnvironmentError("AutoModelForTokenClassification is designed to be instantiated "
+                               "using the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or "
+                               "`AutoModelForTokenClassification.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+    
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
+                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                    - isInstance of `camembert` configuration class: CamembertModel (Camembert model)
+                    - isInstance of `roberta` configuration class: RobertaModel (Roberta model)
+
+        Examples::
+    
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = AutoModelForTokenClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, CamembertConfig):
+            return CamembertForTokenClassification(config)
+        elif isinstance(config, DistilBertConfig):
+            return DistilBertForTokenClassification(config)
+        elif isinstance(config, BertConfig):
+            return BertForTokenClassification(config)
+        elif isinstance(config, XLNetConfig):
+            return XLNetForTokenClassification(config)
+        elif isinstance(config, RobertaConfig):
+            return RobertaForTokenClassification(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaForTokenClassification(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))
+    
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the question answering model classes of the library
+        from a pre-trained model configuration.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForTokenClassification (DistilBERT model)
+            - contains `camembert`: CamembertForTokenClassification (Camembert model)
+            - contains `bert`: BertForTokenClassification (Bert model)
+            - contains `xlnet`: XLNetForTokenClassification (XLNet model)
+            - contains `roberta`: RobertaForTokenClassification (Roberta model)
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+        To train the model, you should first set it back in training mode with `model.train()`
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModelForTokenClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'camembert' in pretrained_model_name_or_path:
+            return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format(pretrained_model_name_or_path))
--- a/transformers/modeling_beam_search.py
+++ b/transformers/modeling_beam_search.py
-# coding=utf-8
-# Copyright (c) 2019 Yang Liu
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-"""
-A general wrapper around models with LM heads to generate sequences
-using beam search.
-"""
-import torch
-from torch import nn
-
-
-class TransformerBeamSearch(nn.Module):
-    def __init__(
-        self,
-        model,
-        tokenizer,
-        batch_size,
-        beam_size,
-        min_length,
-        max_length,
-        alpha=0,
-        block_repeating_trigram=True,
-    ):
-        """
-        Attributes:
-            mask_word_id: token id that corresponds to the mask
-        """
-        super(TransformerBeamSearch, self).__init__()
-        self.model = model
-        self.tokenizer = tokenizer
-
-        self.start_token_id = tokenizer.start_token_id
-        self.end_token_id = tokenizer.end_token_id
-        self.pad_token_id = tokenizer.pad_token_id
-
-        self.beam_size = beam_size
-        self.min_length = min_length
-        self.max_length = max_length
-
-        self.block_repeating_trigram = block_repeating_trigram
-        self.apply_length_penalty = False if alpha == 0 else True
-        self.alpha = alpha
-
-        # State of the beam
-        self.hypotheses = [[] for _ in range(batch_size)]
-        self.batch_offset = torch.arange(batch_size, dtype=torch.long)
-        self.beam_offset = torch.arange(
-            0, batch_size * self.beam_size, step=self.beam_size, dtype=torch.long
-        )
-        self.growing_beam = torch.full(
-            (batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long
-        )
-        self.topk_log_probabilities = torch.tensor(
-            [0.0] + [float("-inf")] * (self.beam_size - 1), dtype=torch.float
-        ).repeat(batch_size)
-        self.results = {
-            "prediction": [[] for _ in batch_size],
-            "scores": [[] for _ in batch_size],
-        }
-        self._step = 0
-        self.is_done = False
-
-    def step(self, log_probabilities):
-        """ Grows the beam by one step. """
-        self._step += 1
-
-        # The batch size changes as some beams finish so we define _B
-        vocab_size = log_probabilities.size(-1)
-        _B = log_probabilities.size(0) // self.beam_size
-
-        # Multiply each beam probability with the probability of the
-        # next token (conditioned on the words in the beam).
-        log_probabilities += self.topk_log_probabilities.view(-1, 1)
-
-        self.enforce_min_length(log_probabilities)
-        if self.block_repeating_trigram:
-            self.remove_repeating_trigrams(log_probabilities, _B)
-
-        # Find the `beam_size` (previous_beam + token) combinations with
-        # the highest score
-        topk_log_probabilities, topk_ids = log_probabilities.topk(
-            log_probabilities.view(_B, self.beam_size * vocab_size),
-            self.beam_size,
-            dim=1,
-        )
-
-        # Apply the length penalty. The +1 accounts for the [EOS] token
-        # that will be added if the beam ends.
-        topk_scores = topk_log_probabilities / self.length_penalty()
-
-        # Retrieve the corresponding respective beam and token id
-        # topk_token_ids[i] will be added to topk_beam_ids[i]
-        topk_beam_ids = topk_ids.div(vocab_size)
-        topk_token_ids = topk_ids.fmod(vocab_size)
-
-        # Retrieve the row index of the surviving beams in the original
-        # view of the log_probabilities tensor
-        surviving_beams_rows = (topk_beam_ids + self.beam_offset[:_B].view(-1, 1)).view(
-            -1
-        )
-
-        # Append the last predictions
-        self.growing_beam = torch.cat(
-            [
-                self.growing_beam.index_select(0, surviving_beams_rows),
-                topk_token_ids.view(-1, 1),
-            ],
-            1,
-        )
-
-        # Check if any of the beam searches has ended during this
-        # growth step. Also if top beam (most probable) has ended
-        # for one element of the batch.
-        is_finished = topk_token_ids.eq(self.end_token_id)
-        self.enforce_max_length()
-        is_top_beam_finished = is_finished[:, 0].eq(1)
-
-        # Save the finished searches
-        if is_finished.any():
-            predictions = self.growing_beam.view(
-                -1, self.beam_size, self.growing_beam.size(1)
-            )
-            for i in range(is_finished.size(0)):
-                if is_top_beam_finished[i]:
-                    is_finished[i].fill_(1)
-                finished_hyp = is_finished[i].nonzero().view(-1)
-
-                # Store finished hypotheses for this batch.
-                b = self.batch_offset[i]
-                for j in finished_hyp:
-                    self.hypotheses[b].append((topk_scores[i, j], predictions[i, j, :]))
-
-                # If the batch reached the end, save the best hypotheses
-                # in terms of length-penalized score.
-                if is_top_beam_finished[i]:
-                    best_hyp = sorted(
-                        self.hypotheses[b], key=lambda x: x[0], reverse=True
-                    )
-                    best_score, best_prediction = best_hyp[0]
-                    self.results["scores"][b].append(best_score)
-                    self.results["predictions"][b].append(best_prediction)
-
-            non_finished = is_top_beam_finished.eq(0).nonzero().view(-1)
-            if len(non_finished) == 0:
-                self.is_done = True
-
-            # Remove finished batches for the next step.
-            topk_log_probabilities = topk_log_probabilities.index_select(
-                0, non_finished
-            )
-            self.batch_offset = self.batch_offset.index_select(0, non_finished)
-            self.growing_beam = predictions.index_select(0, non_finished).view(
-                -1, self.growing_beam.size(-1)
-            )
-
-            surviving_beams_rows = surviving_beams_rows.index_select(0, non_finished)
-
-        return surviving_beams_rows
-
-    def forward(self, encoder_input_ids, **kwargs):
-        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
-        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
-        # that apply to the model as whole.
-        # We let the specific kwargs override the common ones in case of conflict.
-        kwargs_encoder = {
-            argument[len("encoder_"):]: value
-            for argument, value in kwargs.items()
-            if argument.startswith("encoder_")
-        }
-        kwargs_decoder = {
-            argument[len("decoder_"):]: value
-            for argument, value in kwargs.items()
-            if argument.startswith("decoder_")
-        }
-        kwargs_common = {
-            argument: value
-            for argument, value in kwargs.items()
-            if not (argument.startswith("encoder_") or argument.startswith("decoder_"))
-        }
-        kwargs_decoder = dict(kwargs_common, **kwargs_decoder)
-        kwargs_encoder = dict(kwargs_common, **kwargs_encoder)
-
-        # forward pass on the encoder
-        encoder_outputs = self.model.encoder.forward(encoder_input_ids, kwargs_encoder)
-        kwargs_decoder["encoder_hidden_states"] = tile(
-            encoder_outputs, self.beam_size, dim=0
-        )
-
-        # grow the beam by generating sequences in an autoregressive way
-        self.growing_beam = torch.full(
-            (self.batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long
-        )
-        for step in range(self.max_length):
-            decoder_input = self.growing_beam[:, -1]
-            outputs = self.model.decoder(decoder_input, kwargs_decoder)
-            log_probabilities = torch.nn.functional.log_softmax(outputs[1])
-            surviving_beams_rows = self.step(log_probabilities)
-            if self.is_done:
-                break
-
-            kwargs_decoder["encoder_hidden_states"] = kwargs_decoder[
-                "encoder_hidden_states"
-            ].index_select(0, surviving_beams_rows)
-
-        return self.results
-
-    def remove_repeating_trigrams(self, log_probabilities, _B):
-        if(self._step + 1 > 3):
-            for i in range(_B * self.beam_size):
-                tokens = [t for t in self.growing_beam[i]]
-                trigrams = [(tokens[i-1], tokens[i], tokens[i+1]) for i in range(1, len(words) - 1)]
-                last_trigram = tuple(trigrams[-1])
-                if last_trigram in trigrams[:-1]:
-                    log_probabilities[i] = -1e20
-
-    def enforce_min_length(self):
-        if self._step < self.min_length:
-            self.log_probabilities[self.end_token_id] = -1e20
-
-    def enforce_max_length(self):
-        if self._step + 1 == self.max_length:
-            self.is_finished.fill_(1)
-
-    def length_penalty(self):
-        return ((5.0 + (self._step + 1)) / 6.0) ** self.alpha
-
-
-def tile(x, count, dim=0):
-    """
-    Tiles `x` along dimension `dim` `count` times.
-
-    Example:
-        >> ex = torch.tensor([1,2],[3,4])
-        >> tile(ex, 2, 0)
-        torch.Tensor([[1,2],[1,2],[3,4],[3,4]])
-    """
-    perm = list(range(len(x.size())))
-    if dim != 0:
-        perm[0], perm[dim] = perm[dim], perm[0]
-        x = x.permute(perm).contiguous()
-    out_size = list(x.size())
-    out_size[0] *= count
-    batch = x.size(0)
-    x = (
-        x.view(batch, -1)
-        .transpose(0, 1)
-        .repeat(count, 1)
-        .transpose(0, 1)
-        .contiguous()
-        .view(*out_size)
-    )
-    if dim != 0:
-        x = x.permute(perm).contiguous()
-    return x