chenpangpang / transformers / Commits

Commit 0558c9cb
Authored Dec 10, 2019 by thomwolf

    Merge branch 'master' into t5

Parents: 608a8f5b, e57d00ee

Changes: 168. Showing 20 changed files with 1837 additions and 24 deletions (+1837 −24).
templates/adding_a_new_model/tests/modeling_xxx_test.py           +8    −4
templates/adding_a_new_model/tokenization_xxx.py                  +1    −1
transformers-cli                                                  +23   −0
transformers/__init__.py                                          +38   −11
transformers/commands/__init__.py                                 +12   −0
transformers/commands/user.py                                     +165  −0
transformers/configuration_albert.py                              +100  −0
transformers/configuration_auto.py                                +16   −3
transformers/configuration_camembert.py                           +33   −0
transformers/configuration_distilbert.py                          +3    −1
transformers/configuration_gpt2.py                                +1    −0
transformers/configuration_roberta.py                             +2    −0
transformers/configuration_utils.py                               +6    −1
transformers/convert_albert_original_tf_checkpoint_to_pytorch.py  +67   −0
transformers/convert_pytorch_checkpoint_to_tf2.py                 +5    −0
transformers/data/__init__.py                                     +4    −2
transformers/data/metrics/__init__.py                             +8    −0
transformers/data/metrics/squad_metrics.py                        +758  −0
transformers/data/processors/__init__.py                          +2    −1
transformers/data/processors/squad.py                             +585  −0
templates/adding_a_new_model/tests/modeling_xxx_test.py

@@ -18,12 +18,12 @@ from __future__ import print_function
 import unittest
 import shutil
-import pytest
 from transformers import is_torch_available
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 if is_torch_available():
     from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
@@ -31,10 +31,9 @@ if is_torch_available():
                               XxxForQuestionAnswering, XxxForSequenceClassification,
                               XxxForTokenClassification, XxxForMultipleChoice)
     from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
+@require_torch
 class XxxModelTest(CommonTestCases.CommonModelTester):
     all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
@@ -131,6 +130,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = XxxModel(config=config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -148,6 +148,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = XxxForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
@@ -162,6 +163,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = XxxForQuestionAnswering(config=config)
+            model.to(torch_device)
             model.eval()
             loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                    start_positions=sequence_labels, end_positions=sequence_labels)
@@ -182,6 +184,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = XxxForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
             result = {
@@ -197,6 +200,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = XxxForTokenClassification(config=config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
             result = {
@@ -243,7 +247,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
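Note: the `require_torch` and `slow` decorators imported from `.utils` are not part of this diff. A minimal sketch of the pattern they replace the pytest markers with, assuming unittest-style skips and a RUN_SLOW environment flag (names and details here are illustrative, not copied from `.utils`):

import os
import unittest

from transformers import is_torch_available

def require_torch(test_case):
    # Skip the decorated test class/function when PyTorch is absent,
    # replacing the old module-level `pytestmark = pytest.mark.skip(...)`.
    return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case)

def slow(test_case):
    # Skip unless slow tests are explicitly enabled, replacing `@pytest.mark.slow`.
    return unittest.skipUnless(os.getenv("RUN_SLOW", "0") == "1", "test is slow")(test_case)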
templates/adding_a_new_model/tokenization_xxx.py

@@ -172,7 +172,7 @@ class XxxTokenizer(PreTrainedTokenizer):
                 special tokens for the model
         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
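The docstring fix above inverts the stated convention: 1 now marks a special token and 0 a sequence token, matching what the mask methods return. A quick sketch with a real tokenizer (assumes transformers 2.2.x and network access to fetch `bert-base-uncased`):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ids = tokenizer.encode("hello world", add_special_tokens=True)   # [CLS] hello world [SEP]
mask = tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
print(mask)   # [1, 0, 0, 1] -- [CLS] and [SEP] are flagged with 1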
transformers-cli (new file, mode 100644)

#!/usr/bin/env python
from argparse import ArgumentParser

from transformers.commands.user import UserCommands

if __name__ == '__main__':
    parser = ArgumentParser(description='Transformers CLI tool', usage='transformers-cli <command> [<args>]')
    commands_parser = parser.add_subparsers(help='transformers-cli command helpers')

    # Register commands
    UserCommands.register_subcommand(commands_parser)

    # Let's go
    args = parser.parse_args()

    if not hasattr(args, 'func'):
        parser.print_help()
        exit(1)

    # Run
    service = args.func(args)
    service.run()
transformers/__init__.py

-__version__ = "2.1.1"
+__version__ = "2.2.1"
 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.
@@ -25,10 +25,13 @@ from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH
 from .data import (is_sklearn_available,
                    InputExample, InputFeatures, DataProcessor,
                    glue_output_modes, glue_convert_examples_to_features,
-                   glue_processors, glue_tasks_num_labels)
+                   glue_processors, glue_tasks_num_labels,
+                   xnli_output_modes, xnli_processors, xnli_tasks_num_labels,
+                   squad_convert_examples_to_features, SquadFeatures,
+                   SquadExample, SquadV1Processor, SquadV2Processor)
 if is_sklearn_available():
-    from .data import glue_compute_metrics
+    from .data import glue_compute_metrics, xnli_compute_metrics
 # Tokenizers
 from .tokenization_utils import (PreTrainedTokenizer)
@@ -42,6 +45,8 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
+from .tokenization_albert import AlbertTokenizer
+from .tokenization_camembert import CamembertTokenizer
 from .tokenization_t5 import T5Tokenizer
 # Configurations
@@ -56,6 +61,8 @@ from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MA
 from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
 # Modeling
@@ -73,7 +80,8 @@ if is_torch_available():
                                  OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                                  load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
+                                      AdaptiveEmbedding,
                                       load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
                                 GPT2LMHeadModel, GPT2DoubleHeadsModel,
                                 load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -81,9 +89,10 @@ if is_torch_available():
                                 CTRLLMHeadModel,
                                 CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
-                                 XLNetForSequenceClassification, XLNetForMultipleChoice,
-                                 XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering,
+                                 XLNetForSequenceClassification, XLNetForTokenClassification,
+                                 XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple,
+                                 XLNetForQuestionAnswering,
                                  load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_xlm import (XLMPreTrainedModel, XLMModel,
                                XLMWithLMHeadModel, XLMForSequenceClassification,
                                XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
@@ -92,22 +101,31 @@ if is_torch_available():
                                    RobertaForSequenceClassification, RobertaForMultipleChoice,
                                    RobertaForTokenClassification,
                                    ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
+    from .modeling_distilbert import (DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel,
                                       DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
+                                      DistilBertForTokenClassification,
                                       DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_camembert import (CamembertForMaskedLM, CamembertModel,
+                                     CamembertForSequenceClassification, CamembertForMultipleChoice,
+                                     CamembertForTokenClassification,
+                                     CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
     from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel,
                               load_tf_weights_in_t5,
                               T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM,
+                                  AlbertForSequenceClassification, AlbertForQuestionAnswering,
+                                  load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     # Optimization
-    from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
-                               WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
+    from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
+                               get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)
 # TensorFlow
 if is_tf_available():
-    from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary
+    from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
     from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
                                    TFAutoModelWithLMHead)
@@ -133,6 +151,7 @@ if is_tf_available():
     from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
                                     TFXLNetModel, TFXLNetLMHeadModel,
                                     TFXLNetForSequenceClassification,
+                                    TFXLNetForTokenClassification,
                                     TFXLNetForQuestionAnsweringSimple,
                                     TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -151,6 +170,7 @@ if is_tf_available():
     from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
                                          TFDistilBertModel, TFDistilBertForMaskedLM,
                                          TFDistilBertForSequenceClassification,
+                                         TFDistilBertForTokenClassification,
                                          TFDistilBertForQuestionAnswering,
                                          TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -158,9 +178,16 @@ if is_tf_available():
                                    TFCTRLLMHeadModel,
                                    TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
+                                     TFAlbertForSequenceClassification,
+                                     TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_tf_t5 import (TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel,
                                  TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+    # Optimization
+    from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
 # TF 2.0 <=> PyTorch conversion utilities
 from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
                                         load_pytorch_checkpoint_in_tf2_model,
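The optimization hunk above replaces the schedule classes (WarmupLinearSchedule and friends) with factory functions. A minimal sketch of the new API (the stand-in model and step counts are illustrative):

import torch
from transformers import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)   # stand-in model for illustration
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)

for step in range(1000):
    loss = model(torch.randn(4, 10)).sum()
    loss.backward()
    optimizer.step()
    scheduler.step()              # one scheduler step per optimizer step
    optimizer.zero_grad()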
transformers/commands/__init__.py (new file, mode 100644)

from abc import ABC, abstractmethod
from argparse import ArgumentParser


class BaseTransformersCLICommand(ABC):
    @staticmethod
    @abstractmethod
    def register_subcommand(parser: ArgumentParser):
        raise NotImplementedError()

    @abstractmethod
    def run(self):
        raise NotImplementedError()
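How a concrete command plugs into this interface, sketched with a hypothetical `HelloCommand` (the real registration pattern is the `UserCommands` class in the next file):

from argparse import ArgumentParser

from transformers.commands import BaseTransformersCLICommand

class HelloCommand(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        # Attach a subparser and bind a factory so transformers-cli can
        # instantiate the command from the parsed args.
        hello_parser = parser.add_parser('hello')
        hello_parser.add_argument('--name', type=str, default='world')
        hello_parser.set_defaults(func=lambda args: HelloCommand(args))

    def __init__(self, args):
        self.args = args

    def run(self):
        print('Hello, {}!'.format(self.args.name))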
transformers/commands/user.py (new file, mode 100644)

from argparse import ArgumentParser
from getpass import getpass
import os

from transformers.commands import BaseTransformersCLICommand
from transformers.hf_api import HfApi, HfFolder, HTTPError


class UserCommands(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        login_parser = parser.add_parser('login')
        login_parser.set_defaults(func=lambda args: LoginCommand(args))
        whoami_parser = parser.add_parser('whoami')
        whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args))
        logout_parser = parser.add_parser('logout')
        logout_parser.set_defaults(func=lambda args: LogoutCommand(args))
        list_parser = parser.add_parser('ls')
        list_parser.set_defaults(func=lambda args: ListObjsCommand(args))
        # upload
        upload_parser = parser.add_parser('upload')
        upload_parser.add_argument('file', type=str, help='Local filepath of the file to upload.')
        upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override object filename on S3.')
        upload_parser.set_defaults(func=lambda args: UploadCommand(args))


class ANSI:
    """
    Helper for en.wikipedia.org/wiki/ANSI_escape_code
    """
    _bold = u"\u001b[1m"
    _reset = u"\u001b[0m"

    @classmethod
    def bold(cls, s):
        return "{}{}{}".format(cls._bold, s, cls._reset)


class BaseUserCommand:
    def __init__(self, args):
        self.args = args
        self._api = HfApi()


class LoginCommand(BaseUserCommand):
    def run(self):
        print("""
        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
        """)
        username = input("Username: ")
        password = getpass()
        try:
            token = self._api.login(username, password)
        except HTTPError as e:
            # probably invalid credentials, display error message.
            print(e)
            exit(1)
        HfFolder.save_token(token)
        print("Login successful")
        print("Your token:", token, "\n")
        print("Your token has been saved to", HfFolder.path_token)


class WhoamiCommand(BaseUserCommand):
    def run(self):
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit()
        try:
            user = self._api.whoami(token)
            print(user)
        except HTTPError as e:
            print(e)


class LogoutCommand(BaseUserCommand):
    def run(self):
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit()
        HfFolder.delete_token()
        self._api.logout(token)
        print("Successfully logged out.")


class ListObjsCommand(BaseUserCommand):
    def tabulate(self, rows, headers):
        # type: (List[List[Union[str, int]]], List[str]) -> str
        """
        Inspired by:
        stackoverflow.com/a/8356620/593036
        stackoverflow.com/questions/9535954/printing-lists-as-tabular-data
        """
        col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
        row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
        lines = []
        lines.append(row_format.format(*headers))
        lines.append(row_format.format(*["-" * w for w in col_widths]))
        for row in rows:
            lines.append(row_format.format(*row))
        return "\n".join(lines)

    def run(self):
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit(1)
        try:
            objs = self._api.list_objs(token)
        except HTTPError as e:
            print(e)
            exit(1)
        if len(objs) == 0:
            print("No shared file yet")
            exit()
        rows = [[obj.filename, obj.LastModified, obj.ETag, obj.Size] for obj in objs]
        print(self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"]))


class UploadCommand(BaseUserCommand):
    def run(self):
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit(1)
        filepath = os.path.join(os.getcwd(), self.args.file)
        filename = self.args.filename if self.args.filename is not None else os.path.basename(filepath)
        print(
            "About to upload file {} to S3 under filename {}".format(
                ANSI.bold(filepath), ANSI.bold(filename)
            )
        )
        choice = input("Proceed? [Y/n] ").lower()
        if not (choice == "" or choice == "y" or choice == "yes"):
            print("Abort")
            exit()
        print(ANSI.bold("Uploading... This might take a while if file is large"))
        access_url = self._api.presign_and_upload(
            token=token, filename=filename, filepath=filepath
        )
        print("Your file now lives at:")
        print(access_url)
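The `tabulate` helper above is plain `str.format` padding; a quick sketch of what it prints, reusing the same logic on made-up rows (these are not real S3 objects):

rows = [["weights.bin", "2019-12-10", "abc123", 437985064],
        ["config.json", "2019-12-09", "def456", 512]]
headers = ["Filename", "LastModified", "ETag", "Size"]
col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
print(row_format.format(*headers))
print(row_format.format(*["-" * w for w in col_widths]))
for row in rows:
    print(row_format.format(*row))
# Filename    LastModified ETag   Size
# ----------- ------------ ------ ---------
# weights.bin 2019-12-10   abc123 437985064
# config.json 2019-12-09   def456       512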
transformers/configuration_albert.py (new file, mode 100644)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" ALBERT model configuration """

from .configuration_utils import PretrainedConfig

ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json",
    'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json",
    'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json",
    'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json",
    'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json",
    'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json",
    'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json",
    'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json",
}


class AlbertConfig(PretrainedConfig):
    """Configuration for `AlbertModel`.

    The default settings match the configuration of model `albert_xxlarge`.
    """

    pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
                 vocab_size_or_config_json_file=30000,
                 embedding_size=128,
                 hidden_size=4096,
                 num_hidden_layers=12,
                 num_hidden_groups=1,
                 num_attention_heads=64,
                 intermediate_size=16384,
                 inner_group_num=1,
                 hidden_act="gelu_new",
                 hidden_dropout_prob=0,
                 attention_probs_dropout_prob=0,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02,
                 layer_norm_eps=1e-12,
                 **kwargs):
        """Constructs AlbertConfig.

        Args:
            vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`.
            embedding_size: size of voc embeddings.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_hidden_groups: Number of group for the hidden layers, parameters in
                the same group are shared.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            inner_group_num: int, number of inner repetition of attention and ffn.
            down_scale_factor: float, the scale to apply
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler.
            hidden_dropout_prob: The dropout probability for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `AlbertModel`.
            initializer_range: The stdev of the truncated_normal_initializer for
                initializing all weight matrices.
        """
        super(AlbertConfig, self).__init__(**kwargs)

        self.vocab_size = vocab_size_or_config_json_file
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_hidden_groups = num_hidden_groups
        self.num_attention_heads = num_attention_heads
        self.inner_group_num = inner_group_num
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
\ No newline at end of file
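A minimal sketch of constructing this config directly. The defaults above (hidden_size=4096, num_attention_heads=64, intermediate_size=16384) match `albert_xxlarge`, so a base-sized variant overrides them; the values here are illustrative:

from transformers import AlbertConfig

config = AlbertConfig(hidden_size=768, num_hidden_layers=12,
                      num_attention_heads=12, intermediate_size=3072)
print(config.embedding_size)   # 128 -- the factorized embedding stays smaller than hidden_size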
transformers/configuration_auto.py

@@ -27,6 +27,8 @@ from .configuration_xlm import XLMConfig
 from .configuration_roberta import RobertaConfig
 from .configuration_distilbert import DistilBertConfig
 from .configuration_ctrl import CTRLConfig
+from .configuration_camembert import CamembertConfig
+from .configuration_albert import AlbertConfig
 from .configuration_t5 import T5Config

 logger = logging.getLogger(__name__)
@@ -44,13 +46,15 @@ class AutoConfig(object):
         The base model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
             - contains `distilbert`: DistilBertConfig (DistilBERT model)
+            - contains `albert`: AlbertConfig (ALBERT model)
+            - contains `camembert`: CamembertConfig (CamemBERT model)
+            - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `bert`: BertConfig (Bert model)
             - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
             - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
             - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
             - contains `xlnet`: XLNetConfig (XLNet model)
             - contains `xlm`: XLMConfig (XLM model)
-            - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `ctrl` : CTRLConfig (CTRL model)
         This class cannot be instantiated using `__init__()` (throw an error).
         """
@@ -67,13 +71,15 @@ class AutoConfig(object):
         in the `pretrained_model_name_or_path` string (in the following order):
             - contains `t5`: T5Config (T5 model)
             - contains `distilbert`: DistilBertConfig (DistilBERT model)
+            - contains `albert`: AlbertConfig (ALBERT model)
+            - contains `camembert`: CamembertConfig (CamemBERT model)
+            - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `bert`: BertConfig (Bert model)
             - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
             - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
             - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
             - contains `xlnet`: XLNetConfig (XLNet model)
             - contains `xlm`: XLMConfig (XLM model)
-            - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `ctrl` : CTRLConfig (CTRL model)
         Params:
             pretrained_model_name_or_path: either:
@@ -94,6 +100,9 @@ class AutoConfig(object):
             force_download: (`optional`) boolean, default False:
                 Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+            resume_download: (`optional`) boolean, default False:
+                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
             proxies: (`optional`) dict, default None:
                 A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                 The proxies are used on each request.
@@ -120,6 +129,10 @@ class AutoConfig(object):
             return T5Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'camembert' in pretrained_model_name_or_path:
+            return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -138,4 +151,4 @@ class AutoConfig(object):
             return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
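The dispatch above is plain substring matching in source order, so more specific names must be tested first: 'distilbert', 'albert', and 'camembert' are all checked before 'bert' because each of those names contains 'bert'. A sketch of the effect (assumes network access to fetch the config):

from transformers import AutoConfig

config = AutoConfig.from_pretrained('albert-base-v2')
# -> AlbertConfig: the 'albert' branch matches first, even though
#    'albert-base-v2' also contains the substring 'bert'.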
transformers/configuration_camembert.py (new file, mode 100644)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" CamemBERT configuration """

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import logging

from .configuration_roberta import RobertaConfig

logger = logging.getLogger(__name__)

CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json",
}


class CamembertConfig(RobertaConfig):
    pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
transformers/configuration_distilbert.py

@@ -27,7 +27,9 @@ logger = logging.getLogger(__name__)
 DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
-    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json"
+    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json",
+    'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json",
+    'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json",
 }
transformers/configuration_gpt2.py

@@ -29,6 +29,7 @@ logger = logging.getLogger(__name__)
 GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
                                       "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
                                       "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
+                                      "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json",
                                       "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",}

 class GPT2Config(PretrainedConfig):
transformers/configuration_roberta.py

@@ -29,6 +29,8 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
     'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
     'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json",
+    'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json",
+    'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json",
 }
transformers/configuration_utils.py

@@ -94,6 +94,9 @@ class PretrainedConfig(object):
             force_download: (`optional`) boolean, default False:
                 Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+            resume_download: (`optional`) boolean, default False:
+                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
             proxies: (`optional`) dict, default None:
                 A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                 The proxies are used on each request.
@@ -120,6 +123,7 @@ class PretrainedConfig(object):
         """
         cache_dir = kwargs.pop('cache_dir', None)
         force_download = kwargs.pop('force_download', False)
+        resume_download = kwargs.pop('resume_download', False)
         proxies = kwargs.pop('proxies', None)
         return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
@@ -131,7 +135,8 @@ class PretrainedConfig(object):
             config_file = pretrained_model_name_or_path
         # redirect to the cache, if necessary
         try:
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
+            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
+                                               proxies=proxies, resume_download=resume_download)
         except EnvironmentError:
             if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
                 msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
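A minimal sketch of the new kwarg end to end (assumes network access): an interrupted config download is resumed rather than restarted.

from transformers import BertConfig

# `resume_download` is popped in from_pretrained and forwarded to cached_path,
# which keeps a partially downloaded file and continues where it stopped.
config = BertConfig.from_pretrained('bert-base-uncased', resume_download=True)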
transformers/convert_albert_original_tf_checkpoint_to_pytorch.py (new file, mode 100644)

# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert ALBERT checkpoint."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import torch

from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert

import logging
logging.basicConfig(level=logging.INFO)


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = AlbertForMaskedLM(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_albert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--tf_checkpoint_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the TensorFlow checkpoint path.")
    parser.add_argument("--albert_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained ALBERT model.\n"
                             "This specifies the model architecture.")
    parser.add_argument("--pytorch_dump_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the output PyTorch model.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                     args.albert_config_file,
                                     args.pytorch_dump_path)
\ No newline at end of file
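The script is normally run from the command line with the three required flags; it can also be driven from Python (the checkpoint and output paths below are illustrative):

from transformers.convert_albert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch

convert_tf_checkpoint_to_pytorch(tf_checkpoint_path='./albert_base/model.ckpt-best',
                                 albert_config_file='./albert_base/albert_config.json',
                                 pytorch_dump_path='./albert_base/pytorch_model.bin')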
transformers/convert_pytorch_checkpoint_to_tf2.py

@@ -34,6 +34,7 @@ from transformers import (load_pytorch_checkpoint_in_tf2_model,
                           RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
                           DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                           CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                          AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                           T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP)

 if is_torch_available():
@@ -48,6 +49,7 @@ if is_torch_available():
                               RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
                               DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                               CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+                              AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                               T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
     (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
@@ -59,6 +61,7 @@ else:
      RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
      DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
      CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+     AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
      T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = (
        None, None, None, None, None, None,
        None, None, None, None, None, None,
@@ -69,6 +72,7 @@ else:
        None, None, None, None, None, None,
        None, None, None, None, None, None,
-       None, None)
+       None, None,
+       None, None)
@@ -90,6 +94,7 @@ MODEL_CLASSES = {
     'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
     'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
     'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
     't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP),
 }
transformers/data/__init__.py

-from .processors import InputExample, InputFeatures, DataProcessor
+from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
 from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
+from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
+from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels

 from .metrics import is_sklearn_available
 if is_sklearn_available():
-    from .metrics import glue_compute_metrics
+    from .metrics import glue_compute_metrics, xnli_compute_metrics
transformers/data/metrics/__init__.py

@@ -81,3 +81,11 @@ if _has_sklearn:
             return {"acc": simple_accuracy(preds, labels)}
         else:
             raise KeyError(task_name)
+
+    def xnli_compute_metrics(task_name, preds, labels):
+        assert len(preds) == len(labels)
+        if task_name == "xnli":
+            return {"acc": simple_accuracy(preds, labels)}
+        else:
+            raise KeyError(task_name)
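`xnli_compute_metrics` reuses `simple_accuracy`, defined earlier in this module; a quick worked example with toy arrays:

import numpy as np

preds = np.array([0, 1, 2, 1])
labels = np.array([0, 1, 1, 1])
# simple_accuracy is (preds == labels).mean() -> 3 correct of 4 = 0.75, so
# xnli_compute_metrics("xnli", preds, labels) returns {"acc": 0.75}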
examples/utils_squad
.py
→
transformers/data/metrics/squad_metrics
.py
View file @
0558c9cb
""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was
modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question ID's to the model's predicted probability
that a question is unanswerable.
"""
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Load SQuAD dataset. """
from
__future__
import
absolute_import
,
division
,
print_function
import
json
import
json
import
logging
import
logging
...
@@ -24,480 +14,371 @@ import math
...
@@ -24,480 +14,371 @@ import math
import
collections
import
collections
from
io
import
open
from
io
import
open
from
tqdm
import
tqdm
from
tqdm
import
tqdm
import
string
import
re
from
transformers.tokenization_bert
import
BasicTokenizer
,
whitespace_tokenize
from
transformers.tokenization_bert
import
BasicTokenizer
,
whitespace_tokenize
# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
from
utils_squad_evaluate
import
find_all_best_thresh_v2
,
make_qid_to_has_ans
,
get_raw_scores
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()


def compute_exact(a_gold, a_pred):
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))


def compute_f1(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def get_raw_scores(examples, preds):
    """Computes the exact and f1 scores from the examples and the model predictions."""
    exact_scores = {}
    f1_scores = {}
    for example in examples:
        qas_id = example.qas_id
        gold_answers = [answer['text'] for answer in example.answers if normalize_answer(answer['text'])]
        if not gold_answers:
            # For unanswerable questions, the only correct answer is the empty string
            gold_answers = ['']
        if qas_id not in preds:
            print('Missing prediction for %s' % qas_id)
            continue
        prediction = preds[qas_id]
        exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers)
        f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers)
    return exact_scores, f1_scores


def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
    new_scores = {}
    for qid, s in scores.items():
        pred_na = na_probs[qid] > na_prob_thresh
        if pred_na:
            new_scores[qid] = float(not qid_to_has_ans[qid])
        else:
            new_scores[qid] = s
    return new_scores


def make_eval_dict(exact_scores, f1_scores, qid_list=None):
    if not qid_list:
        total = len(exact_scores)
        return collections.OrderedDict([
            ('exact', 100.0 * sum(exact_scores.values()) / total),
            ('f1', 100.0 * sum(f1_scores.values()) / total),
            ('total', total),
        ])
    else:
        total = len(qid_list)
        return collections.OrderedDict([
            ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
            ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
            ('total', total),
        ])


def merge_eval(main_eval, new_eval, prefix):
    for k in new_eval:
        main_eval['%s_%s' % (prefix, k)] = new_eval[k]


class SquadExample(object):
    """
    A single training/test example for the SQuAD dataset.
    For examples without an answer, the start and end position are -1.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        s = ""
        s += "qas_id: %s" % (self.qas_id)
        s += ", question_text: %s" % (self.question_text)
        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
        if self.end_position:
            s += ", end_position: %d" % (self.end_position)
        if self.is_impossible:
            s += ", is_impossible: %r" % (self.is_impossible)
        return s
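
As a quick sanity check of the scoring functions above, here is a minimal sketch; the gold/prediction strings are made up:

# Illustrative only -- toy strings, not part of the file above.
gold = "The leader was John Smith"
pred = "John Smith"
assert compute_exact(gold, pred) == 0        # normalized strings differ
# After normalization the gold tokens are ['leader', 'was', 'john', 'smith']
# and the predicted tokens are ['john', 'smith'], so:
#   precision = 2/2, recall = 2/4, F1 = 2 * 1.0 * 0.5 / 1.5 = 0.667
print(round(compute_f1(gold, pred), 3))      # -> 0.667
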
def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
    cur_score = num_no_ans
    best_score = cur_score
    best_thresh = 0.0
    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
    for i, qid in enumerate(qid_list):
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            diff = scores[qid]
        else:
            if preds[qid]:
                diff = -1
            else:
                diff = 0
        cur_score += diff
        if cur_score > best_score:
            best_score = cur_score
            best_thresh = na_probs[qid]

    has_ans_score, has_ans_cnt = 0, 0
    for qid in qid_list:
        if not qid_to_has_ans[qid]:
            continue
        has_ans_cnt += 1
        if qid not in scores:
            continue
        has_ans_score += scores[qid]

    return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self,
                 unique_id,
                 example_index,
                 doc_span_index,
                 tokens,
                 token_to_orig_map,
                 token_is_max_context,
                 input_ids,
                 input_mask,
                 segment_ids,
                 cls_index,
                 p_mask,
                 paragraph_len,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
        self.unique_id = unique_id
        self.example_index = example_index
        self.doc_span_index = doc_span_index
        self.tokens = tokens
        self.token_to_orig_map = token_to_orig_map
        self.token_is_max_context = token_is_max_context
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.cls_index = cls_index
        self.p_mask = p_mask
        self.paragraph_len = paragraph_len
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible
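
To see what the threshold search does, here is a tiny worked run of `find_best_thresh_v2` (illustrative values; the qids and probabilities are made up):

# Illustrative only: two questions, one answerable, one not.
preds = {'q1': 'Paris', 'q2': ''}
exact = {'q1': 1, 'q2': 1}                    # raw exact-match scores
na_probs = {'q1': 0.1, 'q2': 0.9}             # model's no-answer probabilities
qid_to_has_ans = {'q1': True, 'q2': False}
# Walking thresholds in order of na_prob: accepting q1's answer raises the
# running score, so the best threshold lands at na_probs['q1'] = 0.1.
best, thresh, has_ans = find_best_thresh_v2(preds, exact, na_probs, qid_to_has_ans)
print(best, thresh, has_ans)                  # -> 100.0 0.1 1.0
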
def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError("For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                examples.append(example)
    return examples
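
The whitespace loop above builds a character-to-word map as a side effect; a miniature standalone version of the same logic, on a toy context, makes the mapping concrete:

# Illustrative only: same loop as read_squad_examples, on a made-up context.
context = "Oslo is cold."
doc_tokens, char_to_word_offset = [], []
prev_is_whitespace = True
for c in context:
    if c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F:
        prev_is_whitespace = True
    else:
        if prev_is_whitespace:
            doc_tokens.append(c)
        else:
            doc_tokens[-1] += c
        prev_is_whitespace = False
    char_to_word_offset.append(len(doc_tokens) - 1)
print(doc_tokens)                 # -> ['Oslo', 'is', 'cold.']
print(char_to_word_offset[8])     # character 'c' of 'cold.' -> word index 2
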
def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 doc_stride, max_query_length, is_training,
                                 cls_token_at_end=False,
                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
                                 cls_token_segment_id=0, pad_token_segment_id=0,
                                 mask_padding_with_zero=True,
                                 sequence_a_is_doc=False):
    """Loads a data file into a list of `InputBatch`s."""

    unique_id = 1000000000
    # cnt_pos, cnt_neg = 0, 0
    # max_N, max_M = 1024, 1024
    # f = np.zeros((max_N, max_M), dtype=np.float32)

    features = []
    for (example_index, example) in enumerate(tqdm(examples)):

        # if example_index % 100 == 0:
        #     logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg)

        query_tokens = tokenizer.tokenize(example.question_text)

        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        tok_start_position = None
        tok_end_position = None
        if is_training and example.is_impossible:
            tok_start_position = -1
            tok_end_position = -1
        if is_training and not example.is_impossible:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1
            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
                example.orig_answer_text)

        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we do a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            token_is_max_context = {}
            segment_ids = []

            # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
            # Original TF implem also keeps the classification token (set to 0) (not sure why...)
            p_mask = []

            # CLS token at the beginning
            if not cls_token_at_end:
                tokens.append(cls_token)
                segment_ids.append(cls_token_segment_id)
                p_mask.append(0)
                cls_index = 0

            # XLNet: P SEP Q SEP CLS
            # Others: CLS Q SEP P SEP
            if not sequence_a_is_doc:
                # Query
                tokens += query_tokens
                segment_ids += [sequence_a_segment_id] * len(query_tokens)
                p_mask += [1] * len(query_tokens)

                # SEP token
                tokens.append(sep_token)
                segment_ids.append(sequence_a_segment_id)
                p_mask.append(1)

            # Paragraph
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]

                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                if not sequence_a_is_doc:
                    segment_ids.append(sequence_b_segment_id)
                else:
                    segment_ids.append(sequence_a_segment_id)
                p_mask.append(0)
            paragraph_len = doc_span.length

            if sequence_a_is_doc:
                # SEP token
                tokens.append(sep_token)
                segment_ids.append(sequence_a_segment_id)
                p_mask.append(1)

                tokens += query_tokens
                segment_ids += [sequence_b_segment_id] * len(query_tokens)
                p_mask += [1] * len(query_tokens)

            # SEP token
            tokens.append(sep_token)
            segment_ids.append(sequence_b_segment_id)
            p_mask.append(1)

            # CLS token at the end
            if cls_token_at_end:
                tokens.append(cls_token)
                segment_ids.append(cls_token_segment_id)
                p_mask.append(0)
                cls_index = len(tokens) - 1  # Index of classification token

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(pad_token)
                input_mask.append(0 if mask_padding_with_zero else 1)
                segment_ids.append(pad_token_segment_id)
                p_mask.append(1)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            span_is_impossible = example.is_impossible
            start_position = None
            end_position = None
            if is_training and not span_is_impossible:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = doc_span.start
                doc_end = doc_span.start + doc_span.length - 1
                out_of_span = False
                if not (tok_start_position >= doc_start and
                        tok_end_position <= doc_end):
                    out_of_span = True
                if out_of_span:
                    start_position = 0
                    end_position = 0
                    span_is_impossible = True
                else:
                    if sequence_a_is_doc:
                        doc_offset = 0
                    else:
                        doc_offset = len(query_tokens) + 2
                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset

            if is_training and span_is_impossible:
                start_position = cls_index
                end_position = cls_index

            if example_index < 20:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info("tokens: %s" % " ".join(tokens))
                logger.info("token_to_orig_map: %s" % " ".join([
                    "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
                logger.info("token_is_max_context: %s" % " ".join([
                    "%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]))
                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
                logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
                if is_training and span_is_impossible:
                    logger.info("impossible example")
                if is_training and not span_is_impossible:
                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
                    logger.info("start_position: %d" % (start_position))
                    logger.info("end_position: %d" % (end_position))
                    logger.info("answer: %s" % (answer_text))

            features.append(
                InputFeatures(
                    unique_id=unique_id,
                    example_index=example_index,
                    doc_span_index=doc_span_index,
                    tokens=tokens,
                    token_to_orig_map=token_to_orig_map,
                    token_is_max_context=token_is_max_context,
                    input_ids=input_ids,
                    input_mask=input_mask,
                    segment_ids=segment_ids,
                    cls_index=cls_index,
                    p_mask=p_mask,
                    paragraph_len=paragraph_len,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=span_is_impossible))
            unique_id += 1

    return features


def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(
        preds, exact_raw, na_probs, qid_to_has_ans)
    best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(
        preds, f1_raw, na_probs, qid_to_has_ans)
    main_eval['best_exact'] = best_exact
    main_eval['best_exact_thresh'] = exact_thresh
    main_eval['best_f1'] = best_f1
    main_eval['best_f1_thresh'] = f1_thresh
    main_eval['has_ans_exact'] = has_ans_exact
    main_eval['has_ans_f1'] = has_ans_f1


def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
    cur_score = num_no_ans
    best_score = cur_score
    best_thresh = 0.0
    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
    for _, qid in enumerate(qid_list):
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            diff = scores[qid]
        else:
            if preds[qid]:
                diff = -1
            else:
                diff = 0
        cur_score += diff
        if cur_score > best_score:
            best_score = cur_score
            best_thresh = na_probs[qid]
    return 100.0 * best_score / len(scores), best_thresh


def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
    best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)

    main_eval['best_exact'] = best_exact
    main_eval['best_exact_thresh'] = exact_thresh
    main_eval['best_f1'] = best_f1
    main_eval['best_f1_thresh'] = f1_thresh


def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
    qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples}
    has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer]
    no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer]

    if no_answer_probs is None:
        no_answer_probs = {k: 0.0 for k in preds}

    exact, f1 = get_raw_scores(examples, preds)

    exact_threshold = apply_no_ans_threshold(exact, no_answer_probs, qas_id_to_has_answer,
                                             no_answer_probability_threshold)
    f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer,
                                          no_answer_probability_threshold)

    evaluation = make_eval_dict(exact_threshold, f1_threshold)

    if has_answer_qids:
        has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids)
        merge_eval(evaluation, has_ans_eval, 'HasAns')

    if no_answer_qids:
        no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids)
        merge_eval(evaluation, no_ans_eval, 'NoAns')

    if no_answer_probs:
        find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer)

    return evaluation
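
The sliding-window loop above is easiest to see with numbers; a minimal sketch (the token counts are made up):

# Illustrative only: doc spans for a 500-token document,
# max_tokens_for_doc = 317 (384 - 64 query tokens - 3), doc_stride = 128.
import collections
_DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
doc_spans, start_offset, n_tokens = [], 0, 500
max_tokens_for_doc, doc_stride = 317, 128
while start_offset < n_tokens:
    length = min(n_tokens - start_offset, max_tokens_for_doc)
    doc_spans.append(_DocSpan(start=start_offset, length=length))
    if start_offset + length == n_tokens:
        break
    start_offset += min(length, doc_stride)
print(doc_spans)
# -> [DocSpan(start=0, length=317), DocSpan(start=128, length=317),
#     DocSpan(start=256, length=244)]
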
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""

    # The SQuAD annotations are character based. We first project them to
    # whitespace-tokenized words. But then after WordPiece tokenization, we can
    # often find a "better match". For example:
    #
    #   Question: What year was John Smith born?
    #   Context: The leader was John Smith (1895-1943).
    #   Answer: 1895
    #
    # The original whitespace-tokenized answer will be "(1895-1943).". However
    # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
    # the exact answer, 1895.
    #
    # However, this is not always possible. Consider the following:
    #
    #   Question: What country is the top exporter of electronics?
    #   Context: The Japanese electronics industry is the largest in the world.
    #   Answer: Japan
    #
    # In this case, the annotator chose "Japan" as a character sub-span of
    # the word "Japanese". Since our WordPiece tokenizer does not split
    # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
    # in SQuAD, but does happen.
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)


def _check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""

    # Because of the sliding window approach taken to scoring documents, a single
    # token can appear in multiple documents. E.g.
    #  Doc: the man went to the store and bought a gallon of milk
    #  Span A: the man went to the
    #  Span B: to the store and bought
    #  Span C: and bought a gallon of
    #  ...
    #
    # Now the word 'bought' will have two scores from spans B and C. We only
    # want to consider the score with "maximum context", which we define as
    # the *minimum* of its left and right context (the *sum* of left and
    # right context will always be the same, of course).
    #
    # In the example the maximum context for 'bought' would be span C since
    # it has 1 left context and 3 right context, while span B has 4 left context
    # and 0 right context.
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    return cur_span_index == best_span_index


def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.
    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text


def _get_best_indexes(logits, n_best_size):
    """Get the n-best logits from a list."""
    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)

    best_indexes = []
    for i in range(len(index_and_score)):
        if i >= n_best_size:
            break
        best_indexes.append(index_and_score[i][0])
    return best_indexes


def _compute_softmax(scores):
    """Compute softmax probability over raw logits."""
    if not scores:
        return []

    max_score = None
    for score in scores:
        if max_score is None or score > max_score:
            max_score = score

    exp_scores = []
    total_sum = 0.0
    for score in scores:
        x = math.exp(score - max_score)
        exp_scores.append(x)
        total_sum += x

    probs = []
    for score in exp_scores:
        probs.append(score / total_sum)
    return probs
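
The max-subtraction in `_compute_softmax` is the usual numerical-stability trick; a quick check with made-up logits:

# Illustrative only -- the logits are made up.
logits = [1000.0, 1001.0, 1002.0]
# A naive math.exp(1002.0) overflows; subtracting the max keeps exponents small.
probs = _compute_softmax(logits)
print([round(p, 3) for p in probs])   # -> [0.09, 0.245, 0.665]
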
RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])


# `write_predictions` was renamed to `compute_predictions_logits` in this commit;
# its parameters are unchanged.
def compute_predictions_logits(all_examples, all_features, all_results, n_best_size,
                               max_answer_length, do_lower_case, output_prediction_file,
                               output_nbest_file, output_null_log_odds_file, verbose_logging,
                               version_2_with_negative, null_score_diff_threshold):
    """Write final predictions to the json file and log-odds of null if needed."""
    logger.info("Writing predictions to: %s" % (output_prediction_file))
    logger.info("Writing nbest to: %s" % (output_nbest_file))
...
@@ -626,12 +507,12 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                    text="",
                    start_logit=null_start_logit,
                    end_logit=null_end_logit))

        # In very rare edge cases we could only have single null prediction.
        # So we just create a nonce prediction in this case to avoid failure.
        if len(nbest) == 1:
            nbest.insert(0,
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
...
@@ -688,18 +569,21 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
    return all_predictions
# For XLNet (and XLM which uses the same head)
RawResultExtended = collections.namedtuple("RawResultExtended",
    ["unique_id", "start_top_log_probs", "start_top_index",
     "end_top_log_probs", "end_top_index", "cls_logits"])


# `write_predictions_extended` was renamed to `compute_predictions_log_probs`; the new
# version drops the `orig_data_file` argument and no longer runs the evaluation itself.
def compute_predictions_log_probs(all_examples, all_features, all_results, n_best_size,
                                  max_answer_length, output_prediction_file,
                                  output_nbest_file, output_null_log_odds_file,
                                  start_n_top, end_n_top, version_2_with_negative,
                                  tokenizer, verbose_logging):
    """ XLNet write prediction logic (more complex than Bert's).
        Write final predictions to the json file and log-odds of null if needed.
    """
...
@@ -708,7 +592,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["feature_index", "start_index", "end_index",
         "start_log_prob", "end_log_prob"])

    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
...
@@ -745,12 +629,12 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
            for i in range(start_n_top):
                for j in range(end_n_top):
                    # previously result.start_top_log_probs[i] / result.end_top_log_probs[j_index]
                    start_log_prob = result.start_logits[i]
                    start_index = result.start_top_index[i]

                    j_index = i * end_n_top + j

                    end_log_prob = result.end_logits[j_index]
                    end_index = result.end_top_index[j_index]

                    # We could hypothetically create invalid predictions, e.g., predict
...
@@ -791,7 +675,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
            # XLNet un-tokenizer
            # Let's keep it simple for now and see if we need all this later.
            #
            # tok_start_to_orig_index = feature.tok_start_to_orig_index
            # tok_end_to_orig_index = feature.tok_end_to_orig_index
            # start_orig_pos = tok_start_to_orig_index[pred.start_index]
...
@@ -871,146 +755,4 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
    with open(output_null_log_odds_file, "w") as writer:
        writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

    return all_predictions

# The tail of the old `write_predictions_extended`, removed in this commit, went on to
# evaluate the predictions in place:
#
#     with open(orig_data_file, "r", encoding='utf-8') as reader:
#         orig_data = json.load(reader)["data"]
#     qid_to_has_ans = make_qid_to_has_ans(orig_data)
#     has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
#     no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
#     exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions)
#     out_eval = {}
#     find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw,
#                             scores_diff_json, qid_to_has_ans)
#     return out_eval
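
Putting the pieces together, evaluation is a dictionary-in, dictionary-out affair. A minimal sketch (the example objects and predictions here are toys with just the attributes `squad_evaluate` reads):

# Illustrative only: stand-ins for real SquadExample objects.
from types import SimpleNamespace
examples = [
    SimpleNamespace(qas_id="q1", answers=[{"text": "Paris"}]),
    SimpleNamespace(qas_id="q2", answers=[]),          # unanswerable
]
preds = {"q1": "Paris", "q2": ""}
results = squad_evaluate(examples, preds)
print(results["exact"], results["f1"])                 # -> 100.0 100.0
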
transformers/data/processors/__init__.py
View file @
0558c9cb
from .utils import InputExample, InputFeatures, DataProcessor
from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
\ No newline at end of file
transformers/data/processors/squad.py
0 → 100644
View file @
0558c9cb
from tqdm import tqdm

import collections
import logging
import os
import json

import numpy as np

from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
from .utils import DataProcessor, InputExample, InputFeatures
from ...file_utils import is_tf_available, is_torch_available

if is_torch_available():
    import torch
    from torch.utils.data import TensorDataset

if is_tf_available():
    import tensorflow as tf

logger = logging.getLogger(__name__)
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)


def _check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    return cur_span_index == best_span_index


def _new_check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""
    # if len(doc_spans) == 1:
    #     return True
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span["start"] + doc_span["length"] - 1
        if position < doc_span["start"]:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span["start"]
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    return cur_span_index == best_span_index


def _is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False
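
A small concrete case for the max-context check above (toy spans; a token at position 4 is covered by two overlapping windows):

# Illustrative only: two overlapping spans over a toy document.
spans = [{"start": 0, "length": 5}, {"start": 3, "length": 5}]
# Position 4 sits at the right edge of span 0 (min context 0) but has
# 1 token of left and 3 of right context in span 1, so span 1 wins.
print(_new_check_is_max_context(spans, 0, 4))   # -> False
print(_new_check_is_max_context(spans, 1, 4))   # -> True
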
def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                                       doc_stride, max_query_length, is_training,
                                       return_dataset=False):
    """
    Converts a list of examples into a list of features that can be directly given as input to a model.
    It is model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.

    Args:
        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
        max_seq_length: The maximum sequence length of the inputs.
        doc_stride: The stride used when the context is too large and is split across several features.
        max_query_length: The maximum length of the query.
        is_training: whether to create features for model evaluation or model training.
        return_dataset: Default False. Either 'pt' or 'tf'.
            if 'pt': returns a torch.data.TensorDataset,
            if 'tf': returns a tf.data.Dataset

    Returns:
        list of :class:`~transformers.data.processors.squad.SquadFeatures`

    Example::

        processor = SquadV2Processor()
        examples = processor.get_dev_examples(data_dir)

        features = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
        )
    """

    # Defining helper methods
    unique_id = 1000000000

    features = []
    for (example_index, example) in enumerate(tqdm(examples)):
        if is_training and not example.is_impossible:
            # Get start and end position
            start_position = example.start_position
            end_position = example.end_position

            # If the answer cannot be found in the text, then skip this example.
            actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
                logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
                continue

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        if is_training and not example.is_impossible:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1

            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text)

        spans = []

        truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False,
                                           max_length=max_query_length)
        sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence
        sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair

        span_doc_tokens = all_doc_tokens
        while len(spans) * doc_stride < len(all_doc_tokens):

            encoded_dict = tokenizer.encode_plus(
                truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
                span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
                max_length=max_seq_length,
                return_overflowing_tokens=True,
                pad_to_max_length=True,
                stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
                truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
            )

            paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride,
                                max_seq_length - len(truncated_query) - sequence_pair_added_tokens)

            if tokenizer.pad_token_id in encoded_dict['input_ids']:
                non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
            else:
                non_padded_ids = encoded_dict['input_ids']

            tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

            token_to_orig_map = {}
            for i in range(paragraph_len):
                index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
                token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

            encoded_dict["paragraph_len"] = paragraph_len
            encoded_dict["tokens"] = tokens
            encoded_dict["token_to_orig_map"] = token_to_orig_map
            encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
            encoded_dict["token_is_max_context"] = {}
            encoded_dict["start"] = len(spans) * doc_stride
            encoded_dict["length"] = paragraph_len

            spans.append(encoded_dict)

            if "overflowing_tokens" not in encoded_dict:
                break
            span_doc_tokens = encoded_dict["overflowing_tokens"]

        for doc_span_index in range(len(spans)):
            for j in range(spans[doc_span_index]["paragraph_len"]):
                is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
                index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
                spans[doc_span_index]["token_is_max_context"][index] = is_max_context

        for span in spans:
            # Identify the position of the CLS token
            cls_index = span['input_ids'].index(tokenizer.cls_token_id)

            # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
            # Original TF implem also keeps the classification token (set to 0) (not sure why...)
            p_mask = np.array(span['token_type_ids'])

            p_mask = np.minimum(p_mask, 1)

            if tokenizer.padding_side == "right":
                # Limit positive values to one
                p_mask = 1 - p_mask

            p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1

            # Set the CLS index to '0'
            p_mask[cls_index] = 0

            span_is_impossible = example.is_impossible
            start_position = 0
            end_position = 0
            if is_training and not span_is_impossible:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = span["start"]
                doc_end = span["start"] + span["length"] - 1
                out_of_span = False

                if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                    out_of_span = True

                if out_of_span:
                    start_position = cls_index
                    end_position = cls_index
                    span_is_impossible = True
                else:
                    if tokenizer.padding_side == "left":
                        doc_offset = 0
                    else:
                        doc_offset = len(truncated_query) + sequence_added_tokens

                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset

            features.append(SquadFeatures(
                span['input_ids'],
                span['attention_mask'],
                span['token_type_ids'],
                cls_index,
                p_mask.tolist(),
                example_index=example_index,
                unique_id=unique_id,
                paragraph_len=span['paragraph_len'],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                start_position=start_position,
                end_position=end_position
            ))

            unique_id += 1

    if return_dataset == 'pt':
        if not is_torch_available():
            raise ImportError("Pytorch must be installed to return a pytorch dataset.")

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

        if not is_training:
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_example_index, all_cls_index, all_p_mask)
        else:
            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_start_positions, all_end_positions,
                                    all_cls_index, all_p_mask)

        return features, dataset

    return features
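
Building on the docstring's example, here is a sketch of the 'pt' path feeding a DataLoader (the paths and the tokenizer choice are assumptions, not fixed by this file):

# Illustrative only: assumes SQuAD dev files under data/squad and a BERT tokenizer.
from transformers import BertTokenizer
from torch.utils.data import DataLoader

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
processor = SquadV2Processor()
examples = processor.get_dev_examples("data/squad")
features, dataset = squad_convert_examples_to_features(
    examples=examples, tokenizer=tokenizer, max_seq_length=384,
    doc_stride=128, max_query_length=64, is_training=False,
    return_dataset='pt')
eval_loader = DataLoader(dataset, batch_size=8)
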
class SquadProcessor(DataProcessor):
    """
    Processor for the SQuAD data set.
    Overridden by SquadV1Processor and SquadV2Processor, used by version 1.1 and version 2.0 of SQuAD, respectively.
    """
    train_file = None
    dev_file = None

    def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
        if not evaluate:
            answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8')
            answer_start = tensor_dict['answers']['answer_start'][0].numpy()
            answers = []
        else:
            answers = [{
                "answer_start": start.numpy(),
                "text": text.numpy().decode('utf-8')
            } for start, text in zip(tensor_dict['answers']["answer_start"], tensor_dict['answers']["text"])]

            answer = None
            answer_start = None

        return SquadExample(
            qas_id=tensor_dict['id'].numpy().decode("utf-8"),
            question_text=tensor_dict['question'].numpy().decode('utf-8'),
            context_text=tensor_dict['context'].numpy().decode('utf-8'),
            answer_text=answer,
            start_position_character=answer_start,
            title=tensor_dict['title'].numpy().decode('utf-8'),
            answers=answers
        )

    def get_examples_from_dataset(self, dataset, evaluate=False):
        """
        Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset.

        Args:
            dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
            evaluate: boolean specifying if in evaluation mode or in training mode

        Returns:
            List of SquadExample

        Examples::

            import tensorflow_datasets as tfds
            dataset = tfds.load("squad")

            training_examples = get_examples_from_dataset(dataset, evaluate=False)
            evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
        """
        if evaluate:
            dataset = dataset["validation"]
        else:
            dataset = dataset["train"]

        examples = []
        for tensor_dict in tqdm(dataset):
            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))

        return examples

    def get_train_examples(self, data_dir, filename=None):
        """
        Returns the training examples from the data directory.

        Args:
            data_dir: Directory containing the data files used for training and evaluating.
            filename: None by default, specify this if the training file has a different name than the original one
                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
        """
        if self.train_file is None:
            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

        with open(os.path.join(data_dir, self.train_file if filename is None else filename), "r",
                  encoding='utf-8') as reader:
            input_data = json.load(reader)["data"]
        return self._create_examples(input_data, "train")

    def get_dev_examples(self, data_dir, filename=None):
        """
        Returns the evaluation examples from the data directory.

        Args:
            data_dir: Directory containing the data files used for training and evaluating.
            filename: None by default, specify this if the evaluation file has a different name than the original one
                which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.
        """
        if self.dev_file is None:
            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

        with open(os.path.join(data_dir, self.dev_file if filename is None else filename), "r",
                  encoding='utf-8') as reader:
            input_data = json.load(reader)["data"]
        return self._create_examples(input_data, "dev")

    def _create_examples(self, input_data, set_type):
        is_training = set_type == "train"
        examples = []
        for entry in tqdm(input_data):
            title = entry['title']
            for paragraph in entry["paragraphs"]:
                context_text = paragraph["context"]
                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_position_character = None
                    answer_text = None
                    answers = []

                    if "is_impossible" in qa:
                        is_impossible = qa["is_impossible"]
                    else:
                        is_impossible = False

                    if not is_impossible:
                        if is_training:
                            answer = qa["answers"][0]
                            answer_text = answer['text']
                            start_position_character = answer['answer_start']
                        else:
                            answers = qa["answers"]

                    example = SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        context_text=context_text,
                        answer_text=answer_text,
                        start_position_character=start_position_character,
                        title=title,
                        is_impossible=is_impossible,
                        answers=answers
                    )
                    examples.append(example)
        return examples
class SquadV1Processor(SquadProcessor):
    train_file = "train-v1.1.json"
    dev_file = "dev-v1.1.json"


class SquadV2Processor(SquadProcessor):
    train_file = "train-v2.0.json"
    dev_file = "dev-v2.0.json"
class SquadExample(object):
    """
    A single training/test example for the Squad dataset, as loaded from disk.

    Args:
        qas_id: The example's unique identifier
        question_text: The question string
        context_text: The context string
        answer_text: The answer string
        start_position_character: The character position of the start of the answer
        title: The title of the example
        answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
        is_impossible: False by default, set to True if the example has no possible answer.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 context_text,
                 answer_text,
                 start_position_character,
                 title,
                 answers=[],
                 is_impossible=False):
        self.qas_id = qas_id
        self.question_text = question_text
        self.context_text = context_text
        self.answer_text = answer_text
        self.title = title
        self.is_impossible = is_impossible
        self.answers = answers

        self.start_position, self.end_position = 0, 0

        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True

        # Split on whitespace so that different tokens may be attributed to their original position.
        for c in self.context_text:
            if _is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        self.doc_tokens = doc_tokens
        self.char_to_word_offset = char_to_word_offset

        # Start and end positions only have a value when an answer span is given.
        if start_position_character is not None and not is_impossible:
            self.start_position = char_to_word_offset[start_position_character]
            self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1]
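
To make the character-to-word bookkeeping concrete, a small construction (the example text is made up):

# Illustrative only.
ex = SquadExample(
    qas_id="toy-1",
    question_text="Where is the Eiffel Tower?",
    context_text="The Eiffel Tower is in Paris.",
    answer_text="Paris",
    start_position_character=23,
    title="Eiffel Tower")
print(ex.doc_tokens)                         # -> ['The', 'Eiffel', 'Tower', 'is', 'in', 'Paris.']
print(ex.start_position, ex.end_position)    # -> 5 5
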
class SquadFeatures(object):
    """
    Single squad example features to be fed to a model.
    Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample`
    using the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        cls_index: the index of the CLS token.
        p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
            Mask with 1 for tokens that cannot be in the answer and 0 for tokens that can be in an answer.
        example_index: the index of the example
        unique_id: The unique Feature identifier
        paragraph_len: The length of the context
        token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object.
            If a token does not have its maximum context in this feature object, it means that another feature object
            has more information related to that token and should be prioritized over this feature for that token.
        tokens: list of tokens corresponding to the input ids
        token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
        start_position: start of the answer token index
        end_position: end of the answer token index
    """

    def __init__(self,
                 input_ids,
                 attention_mask,
                 token_type_ids,
                 cls_index,
                 p_mask,
                 example_index,
                 unique_id,
                 paragraph_len,
                 token_is_max_context,
                 tokens,
                 token_to_orig_map,
                 start_position,
                 end_position):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.cls_index = cls_index
        self.p_mask = p_mask

        self.example_index = example_index
        self.unique_id = unique_id
        self.paragraph_len = paragraph_len
        self.token_is_max_context = token_is_max_context
        self.tokens = tokens
        self.token_to_orig_map = token_to_orig_map

        self.start_position = start_position
        self.end_position = end_position
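
As a sketch of how the `p_mask` in these features is produced for a right-padding tokenizer, mirroring the loop in `squad_convert_examples_to_features` above (the ids below are toy BERT-style values):

# Illustrative only: 101 = [CLS], 102 = [SEP], 0 = padding in BERT vocabularies.
import numpy as np
token_type_ids = [0, 0, 0, 1, 1, 1, 1]                 # query = segment 0, context = segment 1
input_ids      = [101, 2054, 102, 7592, 2088, 102, 0]
p_mask = 1 - np.minimum(np.array(token_type_ids), 1)   # context tokens become 0 ("can answer")
p_mask[np.where(np.array(input_ids) == 102)[0]] = 1    # [SEP] can never be part of an answer
p_mask[0] = 0                                          # the CLS index stays available
print(p_mask.tolist())                                 # -> [0, 1, 1, 0, 0, 1, 0]
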
class SquadResult(object):
    """
    Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.

    Args:
        unique_id: The unique identifier corresponding to that example.
        start_logits: The logits corresponding to the start of the answer
        end_logits: The logits corresponding to the end of the answer
    """

    def __init__(self, unique_id, start_logits, end_logits,
                 start_top_index=None, end_top_index=None, cls_logits=None):
        self.start_logits = start_logits
        self.end_logits = end_logits
        self.unique_id = unique_id

        if start_top_index:
            self.start_top_index = start_top_index
            self.end_top_index = end_top_index
            self.cls_logits = cls_logits
\ No newline at end of file
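
A short sketch of how these results are typically assembled during evaluation (the feature and logits are random stand-ins for real model outputs):

# Illustrative only: pair each feature's unique_id with its model outputs.
import random
toy_features = [SquadFeatures([101, 102], [1, 1], [0, 0], 0, [0, 1],
                              example_index=0, unique_id=1000000000,
                              paragraph_len=1, token_is_max_context={},
                              tokens=["[CLS]", "[SEP]"], token_to_orig_map={},
                              start_position=0, end_position=0)]
results = []
for feature in toy_features:
    start_logits = [random.random() for _ in feature.input_ids]
    end_logits = [random.random() for _ in feature.input_ids]
    results.append(SquadResult(feature.unique_id, start_logits, end_logits))
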