Merge branch 'master' into fix-xlnet-squad2.0

562f8640 · Thomas Wolf · GitHub · ca99a2d5 · 8618bf15 · 562f8640
Unverified Commit 562f8640 authored Dec 21, 2019 by Thomas Wolf Committed by GitHub Dec 21, 2019
20 changed files
--- a/examples/summarization/modeling_bertabs.py
+++ b/examples/summarization/modeling_bertabs.py
+# MIT License
+# Copyright (c) 2019 Yang Liu and the HuggingFace team
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import copy
+import math
+import numpy as np
+import torch
+from torch import nn
+from torch.nn.init import xavier_uniform_
+from transformers import BertModel, BertConfig, PreTrainedModel
+from configuration_bertabs import BertAbsConfig
+MAX_SIZE = 5000
+BERTABS_FINETUNED_MODEL_MAP = {
+    "bertabs-finetuned-cnndm": "https://s3.amazonaws.com/models.huggingface.co/bert/remi/bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin",
+}
+class BertAbsPreTrainedModel(PreTrainedModel):
+    config_class = BertAbsConfig
+    pretrained_model_archive_map = BERTABS_FINETUNED_MODEL_MAP
+    load_tf_weights = False
+    base_model_prefix = "bert"
+class BertAbs(BertAbsPreTrainedModel):
+    def __init__(self, args, checkpoint=None, bert_extractive_checkpoint=None):
+        super(BertAbs, self).__init__(args)
+        self.args = args
+        self.bert = Bert()
+        # If pre-trained weights are passed for Bert, load these.
+        load_bert_pretrained_extractive = True if bert_extractive_checkpoint else False
+        if load_bert_pretrained_extractive:
+            self.bert.model.load_state_dict(
+                dict(
+                    [
+                        (n[11:], p)
+                        for n, p in bert_extractive_checkpoint.items()
+                        if n.startswith("bert.model")
+                    ]
+                ),
+                strict=True,
+            )
+        self.vocab_size = self.bert.model.config.vocab_size
+        if args.max_pos > 512:
+            my_pos_embeddings = nn.Embedding(
+                args.max_pos, self.bert.model.config.hidden_size
+            )
+            my_pos_embeddings.weight.data[
+                :512
+            ] = self.bert.model.embeddings.position_embeddings.weight.data
+            my_pos_embeddings.weight.data[
+                512:
+            ] = self.bert.model.embeddings.position_embeddings.weight.data[-1][
+                None, :
+            ].repeat(
+                args.max_pos - 512, 1
+            )
+            self.bert.model.embeddings.position_embeddings = my_pos_embeddings
+        tgt_embeddings = nn.Embedding(
+            self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0
+        )
+        tgt_embeddings.weight = copy.deepcopy(
+            self.bert.model.embeddings.word_embeddings.weight
+        )
+        self.decoder = TransformerDecoder(
+            self.args.dec_layers,
+            self.args.dec_hidden_size,
+            heads=self.args.dec_heads,
+            d_ff=self.args.dec_ff_size,
+            dropout=self.args.dec_dropout,
+            embeddings=tgt_embeddings,
+            vocab_size=self.vocab_size,
+        )
+        gen_func = nn.LogSoftmax(dim=-1)
+        self.generator = nn.Sequential(
+            nn.Linear(args.dec_hidden_size, args.vocab_size), gen_func
+        )
+        self.generator[0].weight = self.decoder.embeddings.weight
+        load_from_checkpoints = False if checkpoint is None else True
+        if load_from_checkpoints:
+            self.load_state_dict(checkpoint)
+    def init_weights(self):
+        for module in self.decoder.modules():
+            if isinstance(module, (nn.Linear, nn.Embedding)):
+                module.weight.data.normal_(mean=0.0, std=0.02)
+            elif isinstance(module, nn.LayerNorm):
+                module.bias.data.zero_()
+                module.weight.data.fill_(1.0)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                module.bias.data.zero_()
+        for p in self.generator.parameters():
+            if p.dim() > 1:
+                xavier_uniform_(p)
+            else:
+                p.data.zero_()
+    def forward(
+        self,
+        encoder_input_ids,
+        decoder_input_ids,
+        token_type_ids,
+        encoder_attention_mask,
+        decoder_attention_mask,
+    ):
+        encoder_output = self.bert(
+            input_ids=encoder_input_ids,
+            token_type_ids=token_type_ids,
+            attention_mask=encoder_attention_mask,
+        )
+        encoder_hidden_states = encoder_output[0]
+        dec_state = self.decoder.init_decoder_state(
+            encoder_input_ids, encoder_hidden_states
+        )
+        decoder_outputs, _ = self.decoder(
+            decoder_input_ids[:, :-1], encoder_hidden_states, dec_state
+        )
+        return decoder_outputs
+class Bert(nn.Module):
+    """ This class is not really necessary and should probably disappear.
+    """
+    def __init__(self):
+        super(Bert, self).__init__()
+        config = BertConfig.from_pretrained("bert-base-uncased")
+        self.model = BertModel(config)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, **kwargs):
+        self.eval()
+        with torch.no_grad():
+            encoder_outputs, _ = self.model(
+                input_ids,
+                token_type_ids=token_type_ids,
+                attention_mask=attention_mask,
+                **kwargs
+            )
+        return encoder_outputs
+class TransformerDecoder(nn.Module):
+    """
+    The Transformer decoder from "Attention is All You Need".
+    Args:
+       num_layers (int): number of encoder layers.
+       d_model (int): size of the model
+       heads (int): number of heads
+       d_ff (int): size of the inner FF layer
+       dropout (float): dropout parameters
+       embeddings (:obj:`onmt.modules.Embeddings`):
+          embeddings to use, should have positional encodings
+       attn_type (str): if using a seperate copy attention
+    """
+    def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings, vocab_size):
+        super(TransformerDecoder, self).__init__()
+        # Basic attributes.
+        self.decoder_type = "transformer"
+        self.num_layers = num_layers
+        self.embeddings = embeddings
+        self.pos_emb = PositionalEncoding(dropout, self.embeddings.embedding_dim)
+        # Build TransformerDecoder.
+        self.transformer_layers = nn.ModuleList(
+            [
+                TransformerDecoderLayer(d_model, heads, d_ff, dropout)
+                for _ in range(num_layers)
+            ]
+        )
+        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
+    # forward(input_ids, attention_mask, encoder_hidden_states, encoder_attention_mask)
+    # def forward(self, input_ids, state, attention_mask=None, memory_lengths=None,
+    # step=None, cache=None, encoder_attention_mask=None, encoder_hidden_states=None, memory_masks=None):
+    def forward(
+        self,
+        input_ids,
+        encoder_hidden_states=None,
+        state=None,
+        attention_mask=None,
+        memory_lengths=None,
+        step=None,
+        cache=None,
+        encoder_attention_mask=None,
+    ):
+        """
+        See :obj:`onmt.modules.RNNDecoderBase.forward()`
+        memory_bank = encoder_hidden_states
+        """
+        # Name conversion
+        tgt = input_ids
+        memory_bank = encoder_hidden_states
+        memory_mask = encoder_attention_mask
+        # src_words = state.src
+        src_words = state.src
+        src_batch, src_len = src_words.size()
+        padding_idx = self.embeddings.padding_idx
+        # Decoder padding mask
+        tgt_words = tgt
+        tgt_batch, tgt_len = tgt_words.size()
+        tgt_pad_mask = (
+            tgt_words.data.eq(padding_idx).unsqueeze(1).expand(tgt_batch, tgt_len, tgt_len)
+        )
+        # Encoder padding mask
+        if memory_mask is not None:
+            src_len = memory_mask.size(-1)
+            src_pad_mask = memory_mask.expand(src_batch, tgt_len, src_len)
+        else:
+            src_pad_mask = (
+                src_words.data.eq(padding_idx)
+                .unsqueeze(1)
+                .expand(src_batch, tgt_len, src_len)
+            )
+        # Pass through the embeddings
+        emb = self.embeddings(input_ids)
+        output = self.pos_emb(emb, step)
+        assert emb.dim() == 3  # len x batch x embedding_dim
+        if state.cache is None:
+            saved_inputs = []
+        for i in range(self.num_layers):
+            prev_layer_input = None
+            if state.cache is None:
+                if state.previous_input is not None:
+                    prev_layer_input = state.previous_layer_inputs[i]
+            output, all_input = self.transformer_layers[i](
+                output,
+                memory_bank,
+                src_pad_mask,
+                tgt_pad_mask,
+                previous_input=prev_layer_input,
+                layer_cache=state.cache["layer_{}".format(i)]
+                if state.cache is not None
+                else None,
+                step=step,
+            )
+            if state.cache is None:
+                saved_inputs.append(all_input)
+        if state.cache is None:
+            saved_inputs = torch.stack(saved_inputs)
+        output = self.layer_norm(output)
+        if state.cache is None:
+            state = state.update_state(tgt, saved_inputs)
+        # Decoders in transformers return a tuple. Beam search will fail
+        # if we don't follow this convention.
+        return output, state  # , state
+    def init_decoder_state(self, src, memory_bank, with_cache=False):
+        """ Init decoder state """
+        state = TransformerDecoderState(src)
+        if with_cache:
+            state._init_cache(memory_bank, self.num_layers)
+        return state
+class PositionalEncoding(nn.Module):
+    def __init__(self, dropout, dim, max_len=5000):
+        pe = torch.zeros(max_len, dim)
+        position = torch.arange(0, max_len).unsqueeze(1)
+        div_term = torch.exp(
+            (torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim))
+        )
+        pe[:, 0::2] = torch.sin(position.float() * div_term)
+        pe[:, 1::2] = torch.cos(position.float() * div_term)
+        pe = pe.unsqueeze(0)
+        super(PositionalEncoding, self).__init__()
+        self.register_buffer("pe", pe)
+        self.dropout = nn.Dropout(p=dropout)
+        self.dim = dim
+    def forward(self, emb, step=None):
+        emb = emb * math.sqrt(self.dim)
+        if step:
+            emb = emb + self.pe[:, step][:, None, :]
+        else:
+            emb = emb + self.pe[:, : emb.size(1)]
+        emb = self.dropout(emb)
+        return emb
+    def get_emb(self, emb):
+        return self.pe[:, : emb.size(1)]
+class TransformerDecoderLayer(nn.Module):
+    """
+    Args:
+      d_model (int): the dimension of keys/values/queries in
+                       MultiHeadedAttention, also the input size of
+                       the first-layer of the PositionwiseFeedForward.
+      heads (int): the number of heads for MultiHeadedAttention.
+      d_ff (int): the second-layer of the PositionwiseFeedForward.
+      dropout (float): dropout probability(0-1.0).
+      self_attn_type (string): type of self-attention scaled-dot, average
+    """
+    def __init__(self, d_model, heads, d_ff, dropout):
+        super(TransformerDecoderLayer, self).__init__()
+        self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
+        self.context_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
+        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
+        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
+        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
+        self.drop = nn.Dropout(dropout)
+        mask = self._get_attn_subsequent_mask(MAX_SIZE)
+        # Register self.mask as a buffer in TransformerDecoderLayer, so
+        # it gets TransformerDecoderLayer's cuda behavior automatically.
+        self.register_buffer("mask", mask)
+    def forward(
+        self,
+        inputs,
+        memory_bank,
+        src_pad_mask,
+        tgt_pad_mask,
+        previous_input=None,
+        layer_cache=None,
+        step=None,
+    ):
+        """
+        Args:
+            inputs (`FloatTensor`): `[batch_size x 1 x model_dim]`
+            memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]`
+            src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]`
+            tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]`
+        Returns:
+            (`FloatTensor`, `FloatTensor`, `FloatTensor`):
+            * output `[batch_size x 1 x model_dim]`
+            * attn `[batch_size x 1 x src_len]`
+            * all_input `[batch_size x current_step x model_dim]`
+        """
+        dec_mask = torch.gt(
+            tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)], 0
+        )
+        input_norm = self.layer_norm_1(inputs)
+        all_input = input_norm
+        if previous_input is not None:
+            all_input = torch.cat((previous_input, input_norm), dim=1)
+            dec_mask = None
+        query = self.self_attn(
+            all_input,
+            all_input,
+            input_norm,
+            mask=dec_mask,
+            layer_cache=layer_cache,
+            type="self",
+        )
+        query = self.drop(query) + inputs
+        query_norm = self.layer_norm_2(query)
+        mid = self.context_attn(
+            memory_bank,
+            memory_bank,
+            query_norm,
+            mask=src_pad_mask,
+            layer_cache=layer_cache,
+            type="context",
+        )
+        output = self.feed_forward(self.drop(mid) + query)
+        return output, all_input
+        # return output
+    def _get_attn_subsequent_mask(self, size):
+        """
+        Get an attention mask to avoid using the subsequent info.
+        Args:
+            size: int
+        Returns:
+            (`LongTensor`):
+            * subsequent_mask `[1 x size x size]`
+        """
+        attn_shape = (1, size, size)
+        subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype("uint8")
+        subsequent_mask = torch.from_numpy(subsequent_mask)
+        return subsequent_mask
+class MultiHeadedAttention(nn.Module):
+    """
+    Multi-Head Attention module from
+    "Attention is All You Need"
+    :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`.
+    Similar to standard `dot` attention but uses
+    multiple attention distributions simulataneously
+    to select relevant items.
+    .. mermaid::
+       graph BT
+          A[key]
+          B[value]
+          C[query]
+          O[output]
+          subgraph Attn
+            D[Attn 1]
+            E[Attn 2]
+            F[Attn N]
+          end
+          A --> D
+          C --> D
+          A --> E
+          C --> E
+          A --> F
+          C --> F
+          D --> O
+          E --> O
+          F --> O
+          B --> O
+    Also includes several additional tricks.
+    Args:
+       head_count (int): number of parallel heads
+       model_dim (int): the dimension of keys/values/queries,
+           must be divisible by head_count
+       dropout (float): dropout parameter
+    """
+    def __init__(self, head_count, model_dim, dropout=0.1, use_final_linear=True):
+        assert model_dim % head_count == 0
+        self.dim_per_head = model_dim // head_count
+        self.model_dim = model_dim
+        super(MultiHeadedAttention, self).__init__()
+        self.head_count = head_count
+        self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head)
+        self.linear_values = nn.Linear(model_dim, head_count * self.dim_per_head)
+        self.linear_query = nn.Linear(model_dim, head_count * self.dim_per_head)
+        self.softmax = nn.Softmax(dim=-1)
+        self.dropout = nn.Dropout(dropout)
+        self.use_final_linear = use_final_linear
+        if self.use_final_linear:
+            self.final_linear = nn.Linear(model_dim, model_dim)
+    def forward(
+        self,
+        key,
+        value,
+        query,
+        mask=None,
+        layer_cache=None,
+        type=None,
+        predefined_graph_1=None,
+    ):
+        """
+        Compute the context vector and the attention vectors.
+        Args:
+           key (`FloatTensor`): set of `key_len`
+                key vectors `[batch, key_len, dim]`
+           value (`FloatTensor`): set of `key_len`
+                value vectors `[batch, key_len, dim]`
+           query (`FloatTensor`): set of `query_len`
+                 query vectors  `[batch, query_len, dim]`
+           mask: binary mask indicating which keys have
+                 non-zero attention `[batch, query_len, key_len]`
+        Returns:
+           (`FloatTensor`, `FloatTensor`) :
+           * output context vectors `[batch, query_len, dim]`
+           * one of the attention vectors `[batch, query_len, key_len]`
+        """
+        batch_size = key.size(0)
+        dim_per_head = self.dim_per_head
+        head_count = self.head_count
+        key_len = key.size(1)
+        query_len = query.size(1)
+        def shape(x):
+            """  projection """
+            return x.view(batch_size, -1, head_count, dim_per_head).transpose(1, 2)
+        def unshape(x):
+            """  compute context """
+            return (
+                x.transpose(1, 2)
+                .contiguous()
+                .view(batch_size, -1, head_count * dim_per_head)
+            )
+        # 1) Project key, value, and query.
+        if layer_cache is not None:
+            if type == "self":
+                query, key, value = (
+                    self.linear_query(query),
+                    self.linear_keys(query),
+                    self.linear_values(query),
+                )
+                key = shape(key)
+                value = shape(value)
+                if layer_cache is not None:
+                    device = key.device
+                    if layer_cache["self_keys"] is not None:
+                        key = torch.cat((layer_cache["self_keys"].to(device), key), dim=2)
+                    if layer_cache["self_values"] is not None:
+                        value = torch.cat(
+                            (layer_cache["self_values"].to(device), value), dim=2
+                        )
+                    layer_cache["self_keys"] = key
+                    layer_cache["self_values"] = value
+            elif type == "context":
+                query = self.linear_query(query)
+                if layer_cache is not None:
+                    if layer_cache["memory_keys"] is None:
+                        key, value = self.linear_keys(key), self.linear_values(value)
+                        key = shape(key)
+                        value = shape(value)
+                    else:
+                        key, value = (
+                            layer_cache["memory_keys"],
+                            layer_cache["memory_values"],
+                        )
+                    layer_cache["memory_keys"] = key
+                    layer_cache["memory_values"] = value
+                else:
+                    key, value = self.linear_keys(key), self.linear_values(value)
+                    key = shape(key)
+                    value = shape(value)
+        else:
+            key = self.linear_keys(key)
+            value = self.linear_values(value)
+            query = self.linear_query(query)
+            key = shape(key)
+            value = shape(value)
+        query = shape(query)
+        key_len = key.size(2)
+        query_len = query.size(2)
+        # 2) Calculate and scale scores.
+        query = query / math.sqrt(dim_per_head)
+        scores = torch.matmul(query, key.transpose(2, 3))
+        if mask is not None:
+            mask = mask.unsqueeze(1).expand_as(scores)
+            scores = scores.masked_fill(mask, -1e18)
+        # 3) Apply attention dropout and compute context vectors.
+        attn = self.softmax(scores)
+        if not predefined_graph_1 is None:
+            attn_masked = attn[:, -1] * predefined_graph_1
+            attn_masked = attn_masked / (torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9)
+            attn = torch.cat([attn[:, :-1], attn_masked.unsqueeze(1)], 1)
+        drop_attn = self.dropout(attn)
+        if self.use_final_linear:
+            context = unshape(torch.matmul(drop_attn, value))
+            output = self.final_linear(context)
+            return output
+        else:
+            context = torch.matmul(drop_attn, value)
+            return context
+class DecoderState(object):
+    """Interface for grouping together the current state of a recurrent
+    decoder. In the simplest case just represents the hidden state of
+    the model.  But can also be used for implementing various forms of
+    input_feeding and non-recurrent models.
+    Modules need to implement this to utilize beam search decoding.
+    """
+    def detach(self):
+        """ Need to document this """
+        self.hidden = tuple([_.detach() for _ in self.hidden])
+        self.input_feed = self.input_feed.detach()
+    def beam_update(self, idx, positions, beam_size):
+        """ Need to document this """
+        for e in self._all:
+            sizes = e.size()
+            br = sizes[1]
+            if len(sizes) == 3:
+                sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2])[
+                    :, :, idx
+                ]
+            else:
+                sent_states = e.view(
+                    sizes[0], beam_size, br // beam_size, sizes[2], sizes[3]
+                )[:, :, idx]
+            sent_states.data.copy_(sent_states.data.index_select(1, positions))
+    def map_batch_fn(self, fn):
+        raise NotImplementedError()
+class TransformerDecoderState(DecoderState):
+    """ Transformer Decoder state base class """
+    def __init__(self, src):
+        """
+        Args:
+            src (FloatTensor): a sequence of source words tensors
+                    with optional feature tensors, of size (len x batch).
+        """
+        self.src = src
+        self.previous_input = None
+        self.previous_layer_inputs = None
+        self.cache = None
+    @property
+    def _all(self):
+        """
+        Contains attributes that need to be updated in self.beam_update().
+        """
+        if self.previous_input is not None and self.previous_layer_inputs is not None:
+            return (self.previous_input, self.previous_layer_inputs, self.src)
+        else:
+            return (self.src,)
+    def detach(self):
+        if self.previous_input is not None:
+            self.previous_input = self.previous_input.detach()
+        if self.previous_layer_inputs is not None:
+            self.previous_layer_inputs = self.previous_layer_inputs.detach()
+        self.src = self.src.detach()
+    def update_state(self, new_input, previous_layer_inputs):
+        state = TransformerDecoderState(self.src)
+        state.previous_input = new_input
+        state.previous_layer_inputs = previous_layer_inputs
+        return state
+    def _init_cache(self, memory_bank, num_layers):
+        self.cache = {}
+        for l in range(num_layers):
+            layer_cache = {"memory_keys": None, "memory_values": None}
+            layer_cache["self_keys"] = None
+            layer_cache["self_values"] = None
+            self.cache["layer_{}".format(l)] = layer_cache
+    def repeat_beam_size_times(self, beam_size):
+        """ Repeat beam_size times along batch dimension. """
+        self.src = self.src.data.repeat(1, beam_size, 1)
+    def map_batch_fn(self, fn):
+        def _recursive_map(struct, batch_dim=0):
+            for k, v in struct.items():
+                if v is not None:
+                    if isinstance(v, dict):
+                        _recursive_map(v)
+                    else:
+                        struct[k] = fn(v, batch_dim)
+        self.src = fn(self.src, 0)
+        if self.cache is not None:
+            _recursive_map(self.cache)
+def gelu(x):
+    return (
+        0.5
+        * x
+        * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    )
+class PositionwiseFeedForward(nn.Module):
+    """ A two-layer Feed-Forward-Network with residual layer norm.
+    Args:
+        d_model (int): the size of input for the first-layer of the FFN.
+        d_ff (int): the hidden layer size of the second-layer
+            of the FNN.
+        dropout (float): dropout probability in :math:`[0, 1)`.
+    """
+    def __init__(self, d_model, d_ff, dropout=0.1):
+        super(PositionwiseFeedForward, self).__init__()
+        self.w_1 = nn.Linear(d_model, d_ff)
+        self.w_2 = nn.Linear(d_ff, d_model)
+        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
+        self.actv = gelu
+        self.dropout_1 = nn.Dropout(dropout)
+        self.dropout_2 = nn.Dropout(dropout)
+    def forward(self, x):
+        inter = self.dropout_1(self.actv(self.w_1(self.layer_norm(x))))
+        output = self.dropout_2(self.w_2(inter))
+        return output + x
+#
+# TRANSLATOR
+# The following code is used to generate summaries using the
+# pre-trained weights and beam search.
+#
+def build_predictor(args, tokenizer, symbols, model, logger=None):
+    # we should be able to refactor the global scorer a lot
+    scorer = GNMTGlobalScorer(args.alpha, length_penalty="wu")
+    translator = Translator(
+        args, model, tokenizer, symbols, global_scorer=scorer, logger=logger
+    )
+    return translator
+class GNMTGlobalScorer(object):
+    """
+    NMT re-ranking score from
+    "Google's Neural Machine Translation System" :cite:`wu2016google`
+    Args:
+       alpha (float): length parameter
+       beta (float):  coverage parameter
+    """
+    def __init__(self, alpha, length_penalty):
+        self.alpha = alpha
+        penalty_builder = PenaltyBuilder(length_penalty)
+        self.length_penalty = penalty_builder.length_penalty()
+    def score(self, beam, logprobs):
+        """
+        Rescores a prediction based on penalty functions
+        """
+        normalized_probs = self.length_penalty(beam, logprobs, self.alpha)
+        return normalized_probs
+class PenaltyBuilder(object):
+    """
+    Returns the Length and Coverage Penalty function for Beam Search.
+    Args:
+        length_pen (str): option name of length pen
+        cov_pen (str): option name of cov pen
+    """
+    def __init__(self, length_pen):
+        self.length_pen = length_pen
+    def length_penalty(self):
+        if self.length_pen == "wu":
+            return self.length_wu
+        elif self.length_pen == "avg":
+            return self.length_average
+        else:
+            return self.length_none
+    """
+    Below are all the different penalty terms implemented so far
+    """
+    def length_wu(self, beam, logprobs, alpha=0.0):
+        """
+        NMT length re-ranking score from
+        "Google's Neural Machine Translation System" :cite:`wu2016google`.
+        """
+        modifier = ((5 + len(beam.next_ys)) ** alpha) / ((5 + 1) ** alpha)
+        return logprobs / modifier
+    def length_average(self, beam, logprobs, alpha=0.0):
+        """
+        Returns the average probability of tokens in a sequence.
+        """
+        return logprobs / len(beam.next_ys)
+    def length_none(self, beam, logprobs, alpha=0.0, beta=0.0):
+        """
+        Returns unmodified scores.
+        """
+        return logprobs
+class Translator(object):
+    """
+    Uses a model to translate a batch of sentences.
+    Args:
+       model (:obj:`onmt.modules.NMTModel`):
+          NMT model to use for translation
+       fields (dict of Fields): data fields
+       beam_size (int): size of beam to use
+       n_best (int): number of translations produced
+       max_length (int): maximum length output to produce
+       global_scores (:obj:`GlobalScorer`):
+         object to rescore final translations
+       copy_attn (bool): use copy attention during translation
+       beam_trace (bool): trace beam search for debugging
+       logger(logging.Logger): logger.
+    """
+    def __init__(self, args, model, vocab, symbols, global_scorer=None, logger=None):
+        self.logger = logger
+        self.args = args
+        self.model = model
+        self.generator = self.model.generator
+        self.vocab = vocab
+        self.symbols = symbols
+        self.start_token = symbols["BOS"]
+        self.end_token = symbols["EOS"]
+        self.global_scorer = global_scorer
+        self.beam_size = args.beam_size
+        self.min_length = args.min_length
+        self.max_length = args.max_length
+    def translate(self, batch, step, attn_debug=False):
+        """ Generates summaries from one batch of data.
+        """
+        self.model.eval()
+        with torch.no_grad():
+            batch_data = self.translate_batch(batch)
+            translations = self.from_batch(batch_data)
+        return translations
+    def translate_batch(self, batch, fast=False):
+        """
+        Translate a batch of sentences.
+        Mostly a wrapper around :obj:`Beam`.
+        Args:
+           batch (:obj:`Batch`): a batch from a dataset object
+           data (:obj:`Dataset`): the dataset object
+           fast (bool): enables fast beam search (may not support all features)
+        Todo:
+           Shouldn't need the original dataset.
+        """
+        with torch.no_grad():
+            return self._fast_translate_batch(
+                batch, self.max_length, min_length=self.min_length
+            )
+    # Where the beam search lives
+    # I have no idea why it is being called from the method above
+    def _fast_translate_batch(self, batch, max_length, min_length=0):
+        """ Beam Search using the encoder inputs contained in `batch`.
+        """
+        # The batch object is funny
+        # Instead of just looking at the size of the arguments we encapsulate
+        # a size argument.
+        # Where is it defined?
+        beam_size = self.beam_size
+        batch_size = batch.batch_size
+        src = batch.src
+        segs = batch.segs
+        mask_src = batch.mask_src
+        src_features = self.model.bert(src, segs, mask_src)
+        dec_states = self.model.decoder.init_decoder_state(
+            src, src_features, with_cache=True
+        )
+        device = src_features.device
+        # Tile states and memory beam_size times.
+        dec_states.map_batch_fn(lambda state, dim: tile(state, beam_size, dim=dim))
+        src_features = tile(src_features, beam_size, dim=0)
+        batch_offset = torch.arange(batch_size, dtype=torch.long, device=device)
+        beam_offset = torch.arange(
+            0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device
+        )
+        alive_seq = torch.full(
+            [batch_size * beam_size, 1], self.start_token, dtype=torch.long, device=device
+        )
+        # Give full probability to the first beam on the first step.
+        topk_log_probs = torch.tensor(
+            [0.0] + [float("-inf")] * (beam_size - 1), device=device
+        ).repeat(batch_size)
+        # Structure that holds finished hypotheses.
+        hypotheses = [[] for _ in range(batch_size)]  # noqa: F812
+        results = {}
+        results["predictions"] = [[] for _ in range(batch_size)]  # noqa: F812
+        results["scores"] = [[] for _ in range(batch_size)]  # noqa: F812
+        results["gold_score"] = [0] * batch_size
+        results["batch"] = batch
+        for step in range(max_length):
+            decoder_input = alive_seq[:, -1].view(1, -1)
+            # Decoder forward.
+            decoder_input = decoder_input.transpose(0, 1)
+            dec_out, dec_states = self.model.decoder(
+                decoder_input, src_features, dec_states, step=step
+            )
+            # Generator forward.
+            log_probs = self.generator.forward(dec_out.transpose(0, 1).squeeze(0))
+            vocab_size = log_probs.size(-1)
+            if step < min_length:
+                log_probs[:, self.end_token] = -1e20
+            # Multiply probs by the beam probability.
+            log_probs += topk_log_probs.view(-1).unsqueeze(1)
+            alpha = self.global_scorer.alpha
+            length_penalty = ((5.0 + (step + 1)) / 6.0) ** alpha
+            # Flatten probs into a list of possibilities.
+            curr_scores = log_probs / length_penalty
+            if self.args.block_trigram:
+                cur_len = alive_seq.size(1)
+                if cur_len > 3:
+                    for i in range(alive_seq.size(0)):
+                        fail = False
+                        words = [int(w) for w in alive_seq[i]]
+                        words = [self.vocab.ids_to_tokens[w] for w in words]
+                        words = " ".join(words).replace(" ##", "").split()
+                        if len(words) <= 3:
+                            continue
+                        trigrams = [
+                            (words[i - 1], words[i], words[i + 1])
+                            for i in range(1, len(words) - 1)
+                        ]
+                        trigram = tuple(trigrams[-1])
+                        if trigram in trigrams[:-1]:
+                            fail = True
+                        if fail:
+                            curr_scores[i] = -10e20
+            curr_scores = curr_scores.reshape(-1, beam_size * vocab_size)
+            topk_scores, topk_ids = curr_scores.topk(beam_size, dim=-1)
+            # Recover log probs.
+            topk_log_probs = topk_scores * length_penalty
+            # Resolve beam origin and true word ids.
+            topk_beam_index = topk_ids.div(vocab_size)
+            topk_ids = topk_ids.fmod(vocab_size)
+            # Map beam_index to batch_index in the flat representation.
+            batch_index = topk_beam_index + beam_offset[
+                : topk_beam_index.size(0)
+            ].unsqueeze(1)
+            select_indices = batch_index.view(-1)
+            # Append last prediction.
+            alive_seq = torch.cat(
+                [alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1
+            )
+            is_finished = topk_ids.eq(self.end_token)
+            if step + 1 == max_length:
+                is_finished.fill_(1)
+            # End condition is top beam is finished.
+            end_condition = is_finished[:, 0].eq(1)
+            # Save finished hypotheses.
+            if is_finished.any():
+                predictions = alive_seq.view(-1, beam_size, alive_seq.size(-1))
+                for i in range(is_finished.size(0)):
+                    b = batch_offset[i]
+                    if end_condition[i]:
+                        is_finished[i].fill_(1)
+                    finished_hyp = is_finished[i].nonzero().view(-1)
+                    # Store finished hypotheses for this batch.
+                    for j in finished_hyp:
+                        hypotheses[b].append((topk_scores[i, j], predictions[i, j, 1:]))
+                    # If the batch reached the end, save the n_best hypotheses.
+                    if end_condition[i]:
+                        best_hyp = sorted(hypotheses[b], key=lambda x: x[0], reverse=True)
+                        score, pred = best_hyp[0]
+                        results["scores"][b].append(score)
+                        results["predictions"][b].append(pred)
+                non_finished = end_condition.eq(0).nonzero().view(-1)
+                # If all sentences are translated, no need to go further.
+                if len(non_finished) == 0:
+                    break
+                # Remove finished batches for the next step.
+                topk_log_probs = topk_log_probs.index_select(0, non_finished)
+                batch_index = batch_index.index_select(0, non_finished)
+                batch_offset = batch_offset.index_select(0, non_finished)
+                alive_seq = predictions.index_select(0, non_finished).view(
+                    -1, alive_seq.size(-1)
+                )
+            # Reorder states.
+            select_indices = batch_index.view(-1)
+            src_features = src_features.index_select(0, select_indices)
+            dec_states.map_batch_fn(
+                lambda state, dim: state.index_select(dim, select_indices)
+            )
+        return results
+    def from_batch(self, translation_batch):
+        batch = translation_batch["batch"]
+        assert len(translation_batch["gold_score"]) == len(translation_batch["predictions"])
+        batch_size = batch.batch_size
+        preds, _, _, tgt_str, src = (
+            translation_batch["predictions"],
+            translation_batch["scores"],
+            translation_batch["gold_score"],
+            batch.tgt_str,
+            batch.src,
+        )
+        translations = []
+        for b in range(batch_size):
+            pred_sents = self.vocab.convert_ids_to_tokens([int(n) for n in preds[b][0]])
+            pred_sents = " ".join(pred_sents).replace(" ##", "")
+            gold_sent = " ".join(tgt_str[b].split())
+            raw_src = [self.vocab.ids_to_tokens[int(t)] for t in src[b]][:500]
+            raw_src = " ".join(raw_src)
+            translation = (pred_sents, gold_sent, raw_src)
+            translations.append(translation)
+        return translations
+def tile(x, count, dim=0):
+    """
+    Tiles x on dimension dim count times.
+    """
+    perm = list(range(len(x.size())))
+    if dim != 0:
+        perm[0], perm[dim] = perm[dim], perm[0]
+        x = x.permute(perm).contiguous()
+    out_size = list(x.size())
+    out_size[0] *= count
+    batch = x.size(0)
+    x = (
+        x.view(batch, -1)
+        .transpose(0, 1)
+        .repeat(count, 1)
+        .transpose(0, 1)
+        .contiguous()
+        .view(*out_size)
+    )
+    if dim != 0:
+        x = x.permute(perm).contiguous()
+    return x
+#
+# Optimizer for training. We keep this here in case we want to add
+# a finetuning script.
+#
+class BertSumOptimizer(object):
+    """ Specific optimizer for BertSum.
+    As described in [1], the authors fine-tune BertSum for abstractive
+    summarization using two Adam Optimizers with different warm-up steps and
+    learning rate. They also use a custom learning rate scheduler.
+    [1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
+        arXiv preprint arXiv:1908.08345 (2019).
+    """
+    def __init__(self, model, lr, warmup_steps, beta_1=0.99, beta_2=0.999, eps=1e-8):
+        self.encoder = model.encoder
+        self.decoder = model.decoder
+        self.lr = lr
+        self.warmup_steps = warmup_steps
+        self.optimizers = {
+            "encoder": torch.optim.Adam(
+                model.encoder.parameters(),
+                lr=lr["encoder"],
+                betas=(beta_1, beta_2),
+                eps=eps,
+            ),
+            "decoder": torch.optim.Adam(
+                model.decoder.parameters(),
+                lr=lr["decoder"],
+                betas=(beta_1, beta_2),
+                eps=eps,
+            ),
+        }
+        self._step = 0
+        self.current_learning_rates = {}
+    def _update_rate(self, stack):
+        return self.lr[stack] * min(
+            self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-1.5)
+        )
+    def zero_grad(self):
+        self.optimizer_decoder.zero_grad()
+        self.optimizer_encoder.zero_grad()
+    def step(self):
+        self._step += 1
+        for stack, optimizer in self.optimizers.items():
+            new_rate = self._update_rate(stack)
+            for param_group in optimizer.param_groups:
+                param_group["lr"] = new_rate
+            optimizer.step()
+            self.current_learning_rates[stack] = new_rate
--- a/examples/summarization/requirements.txt
+++ b/examples/summarization/requirements.txt
+# progress bars in model download and training scripts
+tqdm
+# Accessing files from S3 directly.
+boto3
+# Used for downloading models over HTTP
+requests
+# For ROUGE
+nltk
+py-rouge
--- a/examples/summarization/run_summarization.py
+++ b/examples/summarization/run_summarization.py
+#! /usr/bin/python3
+import argparse
+from collections import namedtuple
+import logging
+import os
+import sys
+import torch
+from torch.utils.data import DataLoader, SequentialSampler
+from tqdm import tqdm
+from transformers import BertTokenizer
+from modeling_bertabs import BertAbs, build_predictor
+from utils_summarization import (
+    SummarizationDataset,
+    encode_for_summarization,
+    build_mask,
+    fit_to_block_size,
+    compute_token_type_ids,
+)
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+Batch = namedtuple(
+    "Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"]
+)
+def evaluate(args):
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
+    model = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
+    model.to(args.device)
+    model.eval()
+    symbols = {
+        "BOS": tokenizer.vocab["[unused0]"],
+        "EOS": tokenizer.vocab["[unused1]"],
+        "PAD": tokenizer.vocab["[PAD]"],
+    }
+    if args.compute_rouge:
+        reference_summaries = []
+        generated_summaries = []
+        import rouge
+        import nltk
+        nltk.download('punkt')
+        rouge_evaluator = rouge.Rouge(
+            metrics=['rouge-n', 'rouge-l'],
+            max_n=2,
+            limit_length=True,
+            length_limit=args.beam_size,
+            length_limit_type='words',
+            apply_avg=True,
+            apply_best=False,
+            alpha=0.5,  # Default F1_score
+            weight_factor=1.2,
+            stemming=True,
+        )
+    # these (unused) arguments are defined to keep the compatibility
+    # with the legacy code and will be deleted in a next iteration.
+    args.result_path = ""
+    args.temp_dir = ""
+    data_iterator = build_data_iterator(args, tokenizer)
+    predictor = build_predictor(args, tokenizer, symbols, model)
+    logger.info("***** Running evaluation *****")
+    logger.info("  Number examples = %d", len(data_iterator.dataset))
+    logger.info("  Batch size = %d", args.batch_size)
+    logger.info("")
+    logger.info("***** Beam Search parameters *****")
+    logger.info("  Beam size = %d", args.beam_size)
+    logger.info("  Minimum length = %d", args.min_length)
+    logger.info("  Maximum length = %d", args.max_length)
+    logger.info("  Alpha (length penalty) = %.2f", args.alpha)
+    logger.info("  Trigrams %s be blocked", ("will" if args.block_trigram else "will NOT"))
+    for batch in tqdm(data_iterator):
+        batch_data = predictor.translate_batch(batch)
+        translations = predictor.from_batch(batch_data)
+        summaries = [format_summary(t) for t in translations]
+        save_summaries(summaries, args.summaries_output_dir, batch.document_names)
+        if args.compute_rouge:
+            reference_summaries += batch.tgt_str
+            generated_summaries += summaries
+    if args.compute_rouge:
+        scores = rouge_evaluator.get_scores(generated_summaries, reference_summaries)
+        str_scores = format_rouge_scores(scores)
+        save_rouge_scores(str_scores)
+        print(str_scores)
+def save_summaries(summaries, path, original_document_name):
+    """ Write the summaries in fies that are prefixed by the original
+    files' name with the `_summary` appended.
+    Attributes:
+        original_document_names: List[string]
+            Name of the document that was summarized.
+        path: string
+            Path were the summaries will be written
+        summaries: List[string]
+            The summaries that we produced.
+    """
+    for summary, document_name in zip(summaries, original_document_name):
+        # Prepare the summary file's name
+        if "." in document_name:
+            bare_document_name = ".".join(document_name.split(".")[:-1])
+            extension = document_name.split(".")[-1]
+            name = bare_document_name + "_summary." + extension
+        else:
+            name = document_name + "_summary"
+        file_path = os.path.join(path, name)
+        with open(file_path, "w") as output:
+            output.write(summary)
+def format_summary(translation):
+    """ Transforms the output of the `from_batch` function
+    into nicely formatted summaries.
+    """
+    raw_summary, _, _ = translation
+    summary = (
+        raw_summary.replace("[unused0]", "")
+        .replace("[unused3]", "")
+        .replace("[PAD]", "")
+        .replace("[unused1]", "")
+        .replace(r" +", " ")
+        .replace(" [unused2] ", ". ")
+        .replace("[unused2]", "")
+        .strip()
+    )
+    return summary
+def format_rouge_scores(scores):
+    return """\n
+****** ROUGE SCORES ******
+** ROUGE 1
+F1        >> {:.3f}
+Precision >> {:.3f}
+Recall    >> {:.3f}
+** ROUGE 2
+F1        >> {:.3f}
+Precision >> {:.3f}
+Recall    >> {:.3f}
+** ROUGE L
+F1        >> {:.3f}
+Precision >> {:.3f}
+Recall    >> {:.3f}""".format(
+        scores['rouge-1']['f'],
+        scores['rouge-1']['p'],
+        scores['rouge-1']['r'],
+        scores['rouge-2']['f'],
+        scores['rouge-2']['p'],
+        scores['rouge-2']['r'],
+        scores['rouge-l']['f'],
+        scores['rouge-l']['p'],
+        scores['rouge-l']['r'],
+    )
+def save_rouge_scores(str_scores):
+    with open("rouge_scores.txt", "w") as output:
+        output.write(str_scores)
+#
+# LOAD the dataset
+#
+def build_data_iterator(args, tokenizer):
+    dataset = load_and_cache_examples(args, tokenizer)
+    sampler = SequentialSampler(dataset)
+    collate_fn = lambda data: collate(data, tokenizer, block_size=512, device=args.device)
+    iterator = DataLoader(
+        dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,
+    )
+    return iterator
+def load_and_cache_examples(args, tokenizer):
+    dataset = SummarizationDataset(args.documents_dir)
+    return dataset
+def collate(data, tokenizer, block_size, device):
+    """ Collate formats the data passed to the data loader.
+    In particular we tokenize the data batch after batch to avoid keeping them
+    all in memory. We output the data as a namedtuple to fit the original BertAbs's
+    API.
+    """
+    data = [x for x in data if not len(x[1]) == 0]  # remove empty_files
+    names = [name for name, _, _ in data]
+    summaries = [" ".join(summary_list) for _, _, summary_list in data]
+    encoded_text = [
+        encode_for_summarization(story, summary, tokenizer) for _, story, summary in data
+    ]
+    encoded_stories = torch.tensor(
+        [
+            fit_to_block_size(story, block_size, tokenizer.pad_token_id)
+            for story, _ in encoded_text
+        ]
+    )
+    encoder_token_type_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id)
+    encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id)
+    batch = Batch(
+        document_names=names,
+        batch_size=len(encoded_stories),
+        src=encoded_stories.to(device),
+        segs=encoder_token_type_ids.to(device),
+        mask_src=encoder_mask.to(device),
+        tgt_str=summaries,
+    )
+    return batch
+def decode_summary(summary_tokens, tokenizer):
+    """ Decode the summary and return it in a format
+    suitable for evaluation.
+    """
+    summary_tokens = summary_tokens.to("cpu").numpy()
+    summary = tokenizer.decode(summary_tokens)
+    sentences = summary.split(".")
+    sentences = [s + "." for s in sentences]
+    return sentences
+def main():
+    """ The main function defines the interface with the users.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--documents_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The folder where the documents to summarize are located.",
+    )
+    parser.add_argument(
+        "--summaries_output_dir",
+        default=None,
+        type=str,
+        required=False,
+        help="The folder in wich the summaries should be written. Defaults to the folder where the documents are",
+    )
+    parser.add_argument(
+        "--compute_rouge",
+        default=False,
+        type=bool,
+        required=False,
+        help="Compute the ROUGE metrics during evaluation. Only available for the CNN/DailyMail dataset.",
+    )
+    # EVALUATION options
+    parser.add_argument(
+        "--no_cuda",
+        default=False,
+        type=bool,
+        help="Whether to force the execution on CPU.",
+    )
+    parser.add_argument(
+        "--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
+    )
+    # BEAM SEARCH arguments
+    parser.add_argument(
+        "--min_length",
+        default=50,
+        type=int,
+        help="Minimum number of tokens for the summaries.",
+    )
+    parser.add_argument(
+        "--max_length",
+        default=200,
+        type=int,
+        help="Maixmum number of tokens for the summaries.",
+    )
+    parser.add_argument(
+        "--beam_size",
+        default=5,
+        type=int,
+        help="The number of beams to start with for each example.",
+    )
+    parser.add_argument(
+        "--alpha",
+        default=0.95,
+        type=float,
+        help="The value of alpha for the length penalty in the beam search.",
+    )
+    parser.add_argument(
+        "--block_trigram",
+        default=True,
+        type=bool,
+        help="Whether to block the existence of repeating trigrams in the text generated by beam search.",
+    )
+    args = parser.parse_args()
+    # Select device (distibuted not available)
+    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    # Check the existence of directories
+    if not args.summaries_output_dir:
+        args.summaries_output_dir = args.documents_dir
+    if not documents_dir_is_valid(args.documents_dir):
+        raise FileNotFoundError(
+            "We could not find the directory you specified for the documents to summarize, or it was empty. Please specify a valid path."
+        )
+    os.makedirs(args.summaries_output_dir, exist_ok=True)
+    evaluate(args)
+def documents_dir_is_valid(path):
+    if not os.path.exists(path):
+        return False
+    file_list = os.listdir(path)
+    if len(file_list) == 0:
+        return False
+    return True
+if __name__ == "__main__":
+    main()
--- a/examples/utils_summarization.py
+++ b/examples/utils_summarization.py
@@ -10,9 +10,14 @@ from torch.utils.data import Dataset
 # ------------
-class CNNDailyMailDataset(Dataset):
+class SummarizationDataset(Dataset):
    """ Abstracts the dataset used to train seq2seq models.
+    The class will process the documents that are located in the specified
+    folder. The preprocessing will work on any document that is reasonably
+    formatted. On the CNN/DailyMail dataset it will extract both the story
+    and the summary.
    CNN/Daily News:
    The CNN/Daily News raw datasets are downloaded from [1]. The stories are
@@ -25,33 +30,33 @@ class CNNDailyMailDataset(Dataset):
    [2] https://github.com/abisee/cnn-dailymail/
    """
-    def __init__(self, tokenizer, prefix="train", data_dir=""):
+    def __init__(self, path="", prefix="train"):
-        assert os.path.isdir(data_dir)
+        """ We initialize the class by listing all the documents to summarize.
-        self.tokenizer = tokenizer
+        Files are not read in memory due to the size of some datasets (like CNN/DailyMail).
+        """
-        # We initialize the class by listing all the files that contain
+        assert os.path.isdir(path)
-        # stories and summaries. Files are not read in memory given
-        # the size of the corpus.
+        self.documents = []
-        self.stories_path = []
+        story_filenames_list = os.listdir(path)
-        datasets = ("cnn", "dailymail")
-        for dataset in datasets:
-            path_to_stories = os.path.join(data_dir, dataset, "stories")
-            story_filenames_list = os.listdir(path_to_stories)
        for story_filename in story_filenames_list:
-                path_to_story = os.path.join(path_to_stories, story_filename)
+            if "summary" in story_filename:
+                continue
+            path_to_story = os.path.join(path, story_filename)
            if not os.path.isfile(path_to_story):
                continue
-                self.stories_path.append(path_to_story)
+            self.documents.append(path_to_story)
    def __len__(self):
-        return len(self.stories_path)
+        """ Returns the number of documents. """
+        return len(self.documents)
    def __getitem__(self, idx):
-        story_path = self.stories_path[idx]
+        document_path = self.documents[idx]
-        with open(story_path, encoding="utf-8") as source:
+        document_name = document_path.split("/")[-1]
+        with open(document_path, encoding="utf-8") as source:
            raw_story = source.read()
            story_lines, summary_lines = process_story(raw_story)
-        return story_lines, summary_lines
+        return document_name, story_lines, summary_lines
 def process_story(raw_story):
@@ -81,7 +86,7 @@ def process_story(raw_story):
            story_lines.append(element)
        except IndexError:
            # if "@highlight" is absent from the file we pop
-            # all elements until there is None.
+            # all elements until there is None, raising an exception.
            return story_lines, []
    # gather summary lines
@@ -104,31 +109,22 @@ def _add_missing_period(line):
 # --------------------------
-def fit_to_block_size(sequence, block_size, pad_token):
+def fit_to_block_size(sequence, block_size, pad_token_id):
    """ Adapt the source and target sequences' lengths to the block size.
-    If the sequence is shorter than the block size we pad it with -1 ids
+    If the sequence is shorter we append padding token to the right of the sequence.
-    which correspond to padding tokens.
    """
    if len(sequence) > block_size:
        return sequence[:block_size]
    else:
-        sequence.extend([pad_token] * (block_size - len(sequence)))
+        sequence.extend([pad_token_id] * (block_size - len(sequence)))
        return sequence
-def build_lm_labels(sequence, pad_token):
+def build_mask(sequence, pad_token_id):
-    """ Padding token, encoded as 0, are represented by the value -1 so they
-    are not taken into account in the loss computation. """
-    padded = sequence.clone()
-    padded[padded == pad_token] = -1
-    return padded
-def build_mask(sequence, pad_token):
    """ Builds the mask. The attention mechanism will only attend to positions
    with value 1. """
    mask = torch.ones_like(sequence)
-    idx_pad_tokens = sequence == pad_token
+    idx_pad_tokens = sequence == pad_token_id
    mask[idx_pad_tokens] = 0
    return mask
@@ -138,18 +134,11 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer):
    as specified in [1] by using `[SEP] [CLS]` tokens to separate
    sentences.
    """
-    story_lines_token_ids = [
+    story_lines_token_ids = [tokenizer.encode(line) for line in story_lines]
-        tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
-        for line in story_lines
-    ]
-    summary_lines_token_ids = [
-        tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
-        for line in summary_lines
-    ]
    story_token_ids = [
        token for sentence in story_lines_token_ids for token in sentence
    ]
+    summary_lines_token_ids = [tokenizer.encode(line) for line in summary_lines]
    summary_token_ids = [
        token for sentence in summary_lines_token_ids for token in sentence
    ]
@@ -174,7 +163,7 @@ def compute_token_type_ids(batch, separator_token_id):
    """
    batch_embeddings = []
    for sequence in batch:
-        sentence_num = 0
+        sentence_num = -1
        embeddings = []
        for s in sequence:
            if s == separator_token_id:

--- a/examples/utils_summarization_test.py
+++ b/examples/utils_summarization_test.py
@@ -21,7 +21,6 @@ from utils_summarization import (
    compute_token_type_ids,
    fit_to_block_size,
    build_mask,
-    build_lm_labels,
    process_story,
 )
@@ -88,20 +87,6 @@ class SummarizationDataProcessingTest(unittest.TestCase):
        expected_summary_lines = ["It was the best of times."]
        self.assertEqual(expected_summary_lines, summary_lines)
-    def test_build_lm_labels_no_padding(self):
-        sequence = torch.tensor([1, 2, 3, 4])
-        expected = sequence
-        np.testing.assert_array_equal(
-            build_lm_labels(sequence, 0).numpy(), expected.numpy()
-        )
-    def test_build_lm_labels(self):
-        sequence = torch.tensor([1, 2, 3, 4, 0, 0, 0])
-        expected = torch.tensor([1, 2, 3, 4, -1, -1, -1])
-        np.testing.assert_array_equal(
-            build_lm_labels(sequence, 0).numpy(), expected.numpy()
-        )
    def test_build_mask_no_padding(self):
        sequence = torch.tensor([1, 2, 3, 4])
        expected = torch.tensor([1, 1, 1, 1])
@@ -125,7 +110,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
            [[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]]
        )
        expected = torch.tensor(
-            [[0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1], [0, 1, 1, 1, 0, 0]]
+            [[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]]
        )
        result = compute_token_type_ids(batch, separator)

--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -72,8 +72,7 @@ class ExamplesTests(unittest.TestCase):
        logger.addHandler(stream_handler)
        testargs = ["run_squad.py",
-                    "--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
+                    "--data_dir=./examples/tests_samples/SQUAD",
-                    "--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
                    "--model_name=bert-base-uncased",
                    "--output_dir=./examples/tests_samples/temp_dir",
                    "--max_steps=10",

--- a/examples/tests_samples/SQUAD/dev-v2.0-small.json
+++ b/examples/tests_samples/SQUAD/dev-v2.0-small.json
--- a/examples/tests_samples/SQUAD/train-v2.0.json
+++ b/examples/tests_samples/SQUAD/train-v2.0.json
+{
+    "version": "v2.0",
+    "data": [{
+        "title": "Normans",
+        "paragraphs": [{
+            "qas": [{
+                "question": "In what country is Normandy located?",
+                "id": "56ddde6b9a695914005b9628",
+                "answers": [{
+                    "text": "France",
+                    "answer_start": 159
+                }],
+                "is_impossible": false
+            }, {
+                "question": "When were the Normans in Normandy?",
+                "id": "56ddde6b9a695914005b9629",
+                "answers": [{
+                    "text": "10th and 11th centuries",
+                    "answer_start": 94
+                }],
+                "is_impossible": false
+            }, {
+                "question": "From which countries did the Norse originate?",
+                "id": "56ddde6b9a695914005b962a",
+                "answers": [{
+                    "text": "Denmark, Iceland and Norway",
+                    "answer_start": 256
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "Rollo",
+                    "answer_start": 308
+                }],
+                "question": "Who did King Charles III swear fealty to?",
+                "id": "5ad39d53604f3c001a3fe8d3",
+                "answers": [],
+                "is_impossible": true
+            }, {
+                "plausible_answers": [{
+                    "text": "10th century",
+                    "answer_start": 671
+                }],
+                "question": "When did the Frankish identity emerge?",
+                "id": "5ad39d53604f3c001a3fe8d4",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
+        }, {
+            "qas": [{
+                "question": "Who was the duke in the battle of Hastings?",
+                "id": "56dddf4066d3e219004dad5f",
+                "answers": [{
+                    "text": "William the Conqueror",
+                    "answer_start": 1022
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "Antioch",
+                    "answer_start": 1295
+                }],
+                "question": "What principality did William the conquerer found?",
+                "id": "5ad3a266604f3c001a3fea2b",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."
+        }]
+    }, {
+        "title": "Computational_complexity_theory",
+        "paragraphs": [{
+            "qas": [{
+                "question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
+                "id": "56e16182e3433e1400422e28",
+                "answers": [{
+                    "text": "Computational complexity theory",
+                    "answer_start": 0
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "algorithm",
+                    "answer_start": 472
+                }],
+                "question": "What is a manual application of mathematical steps?",
+                "id": "5ad5316b5b96ef001a10ab76",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
+        }, {
+            "qas": [{
+                "question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
+                "id": "56e16839cd28a01900c67887",
+                "answers": [{
+                    "text": "if its solution requires significant resources",
+                    "answer_start": 46
+                }],
+                "is_impossible": false
+            }, {
+                "question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
+                "id": "56e16839cd28a01900c67888",
+                "answers": [{
+                    "text": "mathematical models of computation",
+                    "answer_start": 176
+                }],
+                "is_impossible": false
+            }, {
+                "question": "What are two basic primary resources used to guage complexity?",
+                "id": "56e16839cd28a01900c67889",
+                "answers": [{
+                    "text": "time and storage",
+                    "answer_start": 305
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "the number of gates in a circuit",
+                    "answer_start": 436
+                }],
+                "question": "What unit is measured to determine circuit simplicity?",
+                "id": "5ad532575b96ef001a10ab7f",
+                "answers": [],
+                "is_impossible": true
+            }, {
+                "plausible_answers": [{
+                    "text": "the number of processors",
+                    "answer_start": 502
+                }],
+                "question": "What number is used in perpendicular computing?",
+                "id": "5ad532575b96ef001a10ab80",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do."
+        }]
+    }]
+}
\ No newline at end of file
--- a/examples/utils_squad_evaluate.py
+++ b/examples/utils_squad_evaluate.py
-""" Official evaluation script for SQuAD version 2.0.
-    Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
-In addition to basic functionality, we also compute additional statistics and
-plot precision-recall curves if an additional na_prob.json file is provided.
-This file is expected to map question ID's to the model's predicted probability
-that a question is unanswerable.
-"""
-import argparse
-import collections
-import json
-import numpy as np
-import os
-import re
-import string
-import sys
-class EVAL_OPTS():
-  def __init__(self, data_file, pred_file, out_file="",
-               na_prob_file="na_prob.json", na_prob_thresh=1.0,
-               out_image_dir=None, verbose=False):
-    self.data_file = data_file
-    self.pred_file = pred_file
-    self.out_file = out_file
-    self.na_prob_file = na_prob_file
-    self.na_prob_thresh = na_prob_thresh
-    self.out_image_dir = out_image_dir
-    self.verbose = verbose
-OPTS = None
-def parse_args():
-  parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')
-  parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.')
-  parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.')
-  parser.add_argument('--out-file', '-o', metavar='eval.json',
-                      help='Write accuracy metrics to file (default is stdout).')
-  parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json',
-                      help='Model estimates of probability of no answer.')
-  parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0,
-                      help='Predict "" if no-answer probability exceeds this (default = 1.0).')
-  parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None,
-                      help='Save precision-recall curves to directory.')
-  parser.add_argument('--verbose', '-v', action='store_true')
-  if len(sys.argv) == 1:
-    parser.print_help()
-    sys.exit(1)
-  return parser.parse_args()
-def make_qid_to_has_ans(dataset):
-  qid_to_has_ans = {}
-  for article in dataset:
-    for p in article['paragraphs']:
-      for qa in p['qas']:
-        qid_to_has_ans[qa['id']] = bool(qa['answers'])
-  return qid_to_has_ans
-def normalize_answer(s):
-  """Lower text and remove punctuation, articles and extra whitespace."""
-  def remove_articles(text):
-    regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
-    return re.sub(regex, ' ', text)
-  def white_space_fix(text):
-    return ' '.join(text.split())
-  def remove_punc(text):
-    exclude = set(string.punctuation)
-    return ''.join(ch for ch in text if ch not in exclude)
-  def lower(text):
-    return text.lower()
-  return white_space_fix(remove_articles(remove_punc(lower(s))))
-def get_tokens(s):
-  if not s: return []
-  return normalize_answer(s).split()
-def compute_exact(a_gold, a_pred):
-  return int(normalize_answer(a_gold) == normalize_answer(a_pred))
-def compute_f1(a_gold, a_pred):
-  gold_toks = get_tokens(a_gold)
-  pred_toks = get_tokens(a_pred)
-  common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
-  num_same = sum(common.values())
-  if len(gold_toks) == 0 or len(pred_toks) == 0:
-    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
-    return int(gold_toks == pred_toks)
-  if num_same == 0:
-    return 0
-  precision = 1.0 * num_same / len(pred_toks)
-  recall = 1.0 * num_same / len(gold_toks)
-  f1 = (2 * precision * recall) / (precision + recall)
-  return f1
-def get_raw_scores(dataset, preds):
-  exact_scores = {}
-  f1_scores = {}
-  for article in dataset:
-    for p in article['paragraphs']:
-      for qa in p['qas']:
-        qid = qa['id']
-        gold_answers = [a['text'] for a in qa['answers']
-                        if normalize_answer(a['text'])]
-        if not gold_answers:
-          # For unanswerable questions, only correct answer is empty string
-          gold_answers = ['']
-        if qid not in preds:
-          print('Missing prediction for %s' % qid)
-          continue
-        a_pred = preds[qid]
-        # Take max over all gold answers
-        exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
-        f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
-  return exact_scores, f1_scores
-def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
-  new_scores = {}
-  for qid, s in scores.items():
-    pred_na = na_probs[qid] > na_prob_thresh
-    if pred_na:
-      new_scores[qid] = float(not qid_to_has_ans[qid])
-    else:
-      new_scores[qid] = s
-  return new_scores
-def make_eval_dict(exact_scores, f1_scores, qid_list=None):
-  if not qid_list:
-    total = len(exact_scores)
-    return collections.OrderedDict([
-        ('exact', 100.0 * sum(exact_scores.values()) / total),
-        ('f1', 100.0 * sum(f1_scores.values()) / total),
-        ('total', total),
-    ])
-  else:
-    total = len(qid_list)
-    return collections.OrderedDict([
-        ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
-        ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
-        ('total', total),
-    ])
-def merge_eval(main_eval, new_eval, prefix):
-  for k in new_eval:
-    main_eval['%s_%s' % (prefix, k)] = new_eval[k]
-def plot_pr_curve(precisions, recalls, out_image, title):
-  plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
-  plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
-  plt.xlabel('Recall')
-  plt.ylabel('Precision')
-  plt.xlim([0.0, 1.05])
-  plt.ylim([0.0, 1.05])
-  plt.title(title)
-  plt.savefig(out_image)
-  plt.clf()
-def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
-                               out_image=None, title=None):
-  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
-  true_pos = 0.0
-  cur_p = 1.0
-  cur_r = 0.0
-  precisions = [1.0]
-  recalls = [0.0]
-  avg_prec = 0.0
-  for i, qid in enumerate(qid_list):
-    if qid_to_has_ans[qid]:
-      true_pos += scores[qid]
-    cur_p = true_pos / float(i+1)
-    cur_r = true_pos / float(num_true_pos)
-    if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]:
-      # i.e., if we can put a threshold after this point
-      avg_prec += cur_p * (cur_r - recalls[-1])
-      precisions.append(cur_p)
-      recalls.append(cur_r)
-  if out_image:
-    plot_pr_curve(precisions, recalls, out_image, title)
-  return {'ap': 100.0 * avg_prec}
-def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, 
-                                  qid_to_has_ans, out_image_dir):
-  if out_image_dir and not os.path.exists(out_image_dir):
-    os.makedirs(out_image_dir)
-  num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
-  if num_true_pos == 0:
-    return
-  pr_exact = make_precision_recall_eval(
-      exact_raw, na_probs, num_true_pos, qid_to_has_ans,
-      out_image=os.path.join(out_image_dir, 'pr_exact.png'),
-      title='Precision-Recall curve for Exact Match score')
-  pr_f1 = make_precision_recall_eval(
-      f1_raw, na_probs, num_true_pos, qid_to_has_ans,
-      out_image=os.path.join(out_image_dir, 'pr_f1.png'),
-      title='Precision-Recall curve for F1 score')
-  oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
-  pr_oracle = make_precision_recall_eval(
-      oracle_scores, na_probs, num_true_pos, qid_to_has_ans,
-      out_image=os.path.join(out_image_dir, 'pr_oracle.png'),
-      title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)')
-  merge_eval(main_eval, pr_exact, 'pr_exact')
-  merge_eval(main_eval, pr_f1, 'pr_f1')
-  merge_eval(main_eval, pr_oracle, 'pr_oracle')
-def histogram_na_prob(na_probs, qid_list, image_dir, name):
-  if not qid_list:
-    return
-  x = [na_probs[k] for k in qid_list]
-  weights = np.ones_like(x) / float(len(x))
-  plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
-  plt.xlabel('Model probability of no-answer')
-  plt.ylabel('Proportion of dataset')
-  plt.title('Histogram of no-answer probability: %s' % name)
-  plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
-  plt.clf()
-def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
-  num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
-  cur_score = num_no_ans
-  best_score = cur_score
-  best_thresh = 0.0
-  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
-  for i, qid in enumerate(qid_list):
-    if qid not in scores: continue
-    if qid_to_has_ans[qid]:
-      diff = scores[qid]
-    else:
-      if preds[qid]:
-        diff = -1
-      else:
-        diff = 0
-    cur_score += diff
-    if cur_score > best_score:
-      best_score = cur_score
-      best_thresh = na_probs[qid]
-  return 100.0 * best_score / len(scores), best_thresh
-def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
-  num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
-  cur_score = num_no_ans
-  best_score = cur_score
-  best_thresh = 0.0
-  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
-  for i, qid in enumerate(qid_list):
-    if qid not in scores: continue
-    if qid_to_has_ans[qid]:
-      diff = scores[qid]
-    else:
-      if preds[qid]:
-        diff = -1
-      else:
-        diff = 0
-    cur_score += diff
-    if cur_score > best_score:
-      best_score = cur_score
-      best_thresh = na_probs[qid]
-  has_ans_score, has_ans_cnt = 0, 0
-  for qid in qid_list:
-    if not qid_to_has_ans[qid]: continue
-    has_ans_cnt += 1
-    if qid not in scores: continue
-    has_ans_score += scores[qid]
-  return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt
-def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
-  best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
-  best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
-  main_eval['best_exact'] = best_exact
-  main_eval['best_exact_thresh'] = exact_thresh
-  main_eval['best_f1'] = best_f1
-  main_eval['best_f1_thresh'] = f1_thresh
-def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
-  best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
-  best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
-  main_eval['best_exact'] = best_exact
-  main_eval['best_exact_thresh'] = exact_thresh
-  main_eval['best_f1'] = best_f1
-  main_eval['best_f1_thresh'] = f1_thresh
-  main_eval['has_ans_exact'] = has_ans_exact
-  main_eval['has_ans_f1'] = has_ans_f1
-def main(OPTS):
-  with open(OPTS.data_file) as f:
-    dataset_json = json.load(f)
-    dataset = dataset_json['data']
-  with open(OPTS.pred_file) as f:
-    preds = json.load(f)
-  if OPTS.na_prob_file:
-    with open(OPTS.na_prob_file) as f:
-      na_probs = json.load(f)
-  else:
-    na_probs = {k: 0.0 for k in preds}
-  qid_to_has_ans = make_qid_to_has_ans(dataset)  # maps qid to True/False
-  has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
-  no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
-  exact_raw, f1_raw = get_raw_scores(dataset, preds)
-  exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
-                                        OPTS.na_prob_thresh)
-  f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
-                                     OPTS.na_prob_thresh)
-  out_eval = make_eval_dict(exact_thresh, f1_thresh)
-  if has_ans_qids:
-    has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
-    merge_eval(out_eval, has_ans_eval, 'HasAns')
-  if no_ans_qids:
-    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
-    merge_eval(out_eval, no_ans_eval, 'NoAns')
-  if OPTS.na_prob_file:
-    find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
-  if OPTS.na_prob_file and OPTS.out_image_dir:
-    run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs, 
-                                  qid_to_has_ans, OPTS.out_image_dir)
-    histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
-    histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')
-  if OPTS.out_file:
-    with open(OPTS.out_file, 'w') as f:
-      json.dump(out_eval, f)
-  else:
-    print(json.dumps(out_eval, indent=2))
-  return out_eval
-if __name__ == '__main__':
-  OPTS = parse_args()
-  if OPTS.out_image_dir:
-    import matplotlib
-    matplotlib.use('Agg')
-    import matplotlib.pyplot as plt 
-  main(OPTS)
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ boto3
 # Used for downloading models over HTTP
 requests
 # For OpenAI GPT
-regex
+regex != 2019.12.17
 # For XLNet
 sentencepiece
 # For XLM

--- a/setup.py
+++ b/setup.py
@@ -36,9 +36,17 @@ To create the package for pypi.
 from io import open
 from setuptools import find_packages, setup
+extras = {
+    'serving': ['pydantic', 'uvicorn', 'fastapi'],
+    'serving-tf': ['pydantic', 'uvicorn', 'fastapi', 'tensorflow'],
+    'serving-torch': ['pydantic', 'uvicorn', 'fastapi', 'torch']
+}
+extras['all'] = [package for package in extras.values()]
 setup(
    name="transformers",
-    version="2.1.1",
+    version="2.3.0",
    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
    author_email="thomas@huggingface.co",
    description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
@@ -51,18 +59,17 @@ setup(
                                    "tests.*", "tests"]),
    install_requires=['numpy',
                      'boto3',
+                      'filelock',
                      'requests',
                      'tqdm',
-                      'regex',
+                      'regex != 2019.12.17',
                      'sentencepiece',
                      'sacremoses'],
-    entry_points={
+    extras_require=extras,
-      'console_scripts': [
+    scripts=[
-        "transformers=transformers.__main__:main",
+        'transformers-cli'
-      ]
+    ],
-    },
    # python_requires='>=3.5.0',
-    tests_require=['pytest'],
    classifiers=[
          'Intended Audience :: Science/Research',
          'License :: OSI Approved :: Apache Software License',

--- a/templates/adding_a_new_model/configuration_xxx.py
+++ b/templates/adding_a_new_model/configuration_xxx.py
@@ -39,7 +39,7 @@ class XxxConfig(PretrainedConfig):
        Arguments:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XxxModel`.
+            vocab_size: Vocabulary size of `inputs_ids` in `XxxModel`.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
@@ -64,7 +64,7 @@ class XxxConfig(PretrainedConfig):
    pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP
    def __init__(self,
-                 vocab_size_or_config_json_file=50257,
+                 vocab_size=50257,
                 n_positions=1024,
                 n_ctx=1024,
                 n_embd=768,
@@ -75,8 +75,6 @@ class XxxConfig(PretrainedConfig):
                 attn_pdrop=0.1,
                 layer_norm_epsilon=1e-5,
                 initializer_range=0.02,
-                 num_labels=1,
                 summary_type='cls_index',
                 summary_use_proj=True,
                 summary_activation=None,
@@ -84,7 +82,7 @@ class XxxConfig(PretrainedConfig):
                 summary_first_dropout=0.1,
                 **kwargs):
        super(XxxConfig, self).__init__(**kwargs)
-        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, six.string_types) else -1
+        self.vocab_size = vocab_size
        self.n_ctx = n_ctx
        self.n_positions = n_positions
        self.n_embd = n_embd
@@ -95,23 +93,11 @@ class XxxConfig(PretrainedConfig):
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
-        self.num_labels = num_labels
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
-        if isinstance(vocab_size_or_config_json_file, six.string_types):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif not isinstance(vocab_size_or_config_json_file, int):
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )
    @property
    def max_position_embeddings(self):

--- a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
+++ b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
@@ -26,9 +26,9 @@ from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx
 import logging
 logging.basicConfig(level=logging.INFO)
-def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, xxx_config_file, pytorch_dump_path):
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
    # Initialise PyTorch model
-    config = XxxConfig.from_json_file(xxx_config_file)
+    config = XxxConfig.from_json_file(config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = XxxForPreTraining(config)
@@ -48,11 +48,11 @@ if __name__ == "__main__":
                        type = str,
                        required = True,
                        help = "Path to the TensorFlow checkpoint path.")
-    parser.add_argument("--xxx_config_file",
+    parser.add_argument("--config_file",
                        default = None,
                        type = str,
                        required = True,
-                        help = "The config json file corresponding to the pre-trained XXX model. \n"
+                        help = "The config json file corresponding to the pre-trained model. \n"
                            "This specifies the model architecture.")
    parser.add_argument("--pytorch_dump_path",
                        default = None,
@@ -61,5 +61,5 @@ if __name__ == "__main__":
                        help = "Path to the output PyTorch model.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
-                                     args.xxx_config_file,
+                                     args.config_file,
                                     args.pytorch_dump_path)
--- a/templates/adding_a_new_model/modeling_tf_xxx.py
+++ b/templates/adding_a_new_model/modeling_tf_xxx.py
@@ -26,13 +26,15 @@ import logging
 import math
 import os
 import sys
+import copy
+import itertools
 from io import open
 import numpy as np
 import tensorflow as tf
 from .configuration_xxx import XxxConfig
-from .modeling_tf_utils import TFPreTrainedModel, get_initializer
+from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
 from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
@@ -121,9 +123,9 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
            input_ids = inputs
        if attention_mask is None:
-            attention_mask = tf.fill(tf.shape(input_ids), 1)
+            attention_mask = tf.fill(shape_list(input_ids), 1)
        if token_type_ids is None:
-            token_type_ids = tf.fill(tf.shape(input_ids), 0)
+            token_type_ids = tf.fill(shape_list(input_ids), 0)
        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]

--- a/templates/adding_a_new_model/modeling_xxx.py
+++ b/templates/adding_a_new_model/modeling_xxx.py
@@ -25,6 +25,8 @@ import logging
 import math
 import os
 import sys
+import copy
+import itertools
 from io import open
 import torch

--- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-import shutil
-import pytest
 import sys
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_tf, slow
 from transformers import XxxConfig, is_tf_available
@@ -33,10 +32,9 @@ if is_tf_available():
                                               TFXxxForTokenClassification,
                                               TFXxxForQuestionAnswering,
                                               TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
+@require_tf
 class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
    all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering,
@@ -112,7 +110,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = XxxConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
@@ -244,12 +242,10 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
-    @pytest.mark.slow
+    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in ['xxx-base-uncased']:
-            model = TFXxxModel.from_pretrained(model_name, cache_dir=cache_dir)
+            model = TFXxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
 if __name__ == "__main__":

--- a/templates/adding_a_new_model/tests/modeling_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py
@@ -17,13 +17,12 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-import shutil
-import pytest
 from transformers import is_torch_available
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 if is_torch_available():
    from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
@@ -31,10 +30,9 @@ if is_torch_available():
                                        XxxForQuestionAnswering, XxxForSequenceClassification,
                                        XxxForTokenClassification, XxxForMultipleChoice)
    from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
+@require_torch
 class XxxModelTest(CommonTestCases.CommonModelTester):
    all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
@@ -110,7 +108,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = XxxConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
@@ -131,6 +129,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
        def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = XxxModel(config=config)
+            model.to(torch_device)
            model.eval()
            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -148,6 +147,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
        def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = XxxForMaskedLM(config=config)
+            model.to(torch_device)
            model.eval()
            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
            result = {
@@ -162,6 +162,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
        def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = XxxForQuestionAnswering(config=config)
+            model.to(torch_device)
            model.eval()
            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                   start_positions=sequence_labels, end_positions=sequence_labels)
@@ -182,6 +183,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
        def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            config.num_labels = self.num_labels
            model = XxxForSequenceClassification(config)
+            model.to(torch_device)
            model.eval()
            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
            result = {
@@ -197,6 +199,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
        def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            config.num_labels = self.num_labels
            model = XxxForTokenClassification(config=config)
+            model.to(torch_device)
            model.eval()
            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
            result = {
@@ -243,12 +246,10 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
-    @pytest.mark.slow
+    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = XxxModel.from_pretrained(model_name, cache_dir=cache_dir)
+            model = XxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
 if __name__ == "__main__":

--- a/templates/adding_a_new_model/tokenization_xxx.py
+++ b/templates/adding_a_new_model/tokenization_xxx.py
@@ -85,7 +85,7 @@ class XxxTokenizer(PreTrainedTokenizer):
    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
+        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
    """
    vocab_files_names = VOCAB_FILES_NAMES

--- a/transformers-cli
+++ b/transformers-cli
+#!/usr/bin/env python
+from argparse import ArgumentParser
+from transformers.commands.download import DownloadCommand
+from transformers.commands.run import RunCommand
+from transformers.commands.user import UserCommands
+from transformers.commands.convert import ConvertCommand
+from transformers.commands.serving import ServeCommand
+if __name__ == '__main__':
+    parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli <command> [<args>]')
+    commands_parser = parser.add_subparsers(help='transformers-cli command helpers')
+    # Register commands
+    ConvertCommand.register_subcommand(commands_parser)
+    DownloadCommand.register_subcommand(commands_parser)
+    RunCommand.register_subcommand(commands_parser)
+    ServeCommand.register_subcommand(commands_parser)
+    UserCommands.register_subcommand(commands_parser)
+    # Let's go
+    args = parser.parse_args()
+    if not hasattr(args, 'func'):
+        parser.print_help()
+        exit(1)
+    # Run
+    service = args.func(args)
+    service.run()
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
-__version__ = "2.1.1"
+__version__ = "2.3.0"
 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.
@@ -19,21 +19,29 @@ logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 # Files and general utilities
 from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
                         cached_path, add_start_docstrings, add_end_docstrings,
-                         WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME,
+                         WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, MODEL_CARD_NAME,
                         is_tf_available, is_torch_available)
 from .data import (is_sklearn_available,
                   InputExample, InputFeatures, DataProcessor,
+                   SingleSentenceClassificationProcessor,
                   glue_output_modes, glue_convert_examples_to_features,
-                   glue_processors, glue_tasks_num_labels)
+                   glue_processors, glue_tasks_num_labels,
+                   xnli_output_modes, xnli_processors, xnli_tasks_num_labels,
+                   squad_convert_examples_to_features, SquadFeatures, 
+                   SquadExample, SquadV1Processor, SquadV2Processor)
 if is_sklearn_available():
-    from .data import glue_compute_metrics
+    from .data import glue_compute_metrics, xnli_compute_metrics
+# Model Cards
+from .modelcard import ModelCard
 # Tokenizers
 from .tokenization_utils import (PreTrainedTokenizer)
 from .tokenization_auto import AutoTokenizer
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
+from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
@@ -42,26 +50,33 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
+from .tokenization_albert import AlbertTokenizer
+from .tokenization_camembert import CamembertTokenizer
+from .tokenization_t5 import T5Tokenizer
+from .tokenization_xlm_roberta import XLMRobertaTokenizer
 # Configurations
 from .configuration_utils import PretrainedConfig
-from .configuration_auto import AutoConfig
+from .configuration_auto import AutoConfig, ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 # Modeling
 if is_torch_available():
    from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
    from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
-                                AutoModelWithLMHead)
+                                AutoModelWithLMHead, AutoModelForTokenClassification, ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
                                BertForMaskedLM, BertForNextSentencePrediction,
@@ -81,9 +96,10 @@ if is_torch_available():
                                CTRLLMHeadModel,
                                CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
-                                XLNetForSequenceClassification, XLNetForMultipleChoice,
+                                XLNetForSequenceClassification, XLNetForTokenClassification,
-                                XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering,
+                                XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple,
-                                load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
+                                XLNetForQuestionAnswering, load_tf_weights_in_xlnet,
+                                XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
                            XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
@@ -92,11 +108,25 @@ if is_torch_available():
                                RobertaForSequenceClassification, RobertaForMultipleChoice,
                                RobertaForTokenClassification,
                                ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
+    from .modeling_distilbert import (DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel,
                                DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
                                DistilBertForTokenClassification,
                                DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_camembert import (CamembertForMaskedLM, CamembertModel,
+                                CamembertForSequenceClassification, CamembertForMultipleChoice,
+                                CamembertForTokenClassification,
+                                CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
+    from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel,
+                              load_tf_weights_in_t5,
+                              T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification,
+                                AlbertForQuestionAnswering,
+                                load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_xlm_roberta import (XLMRobertaForMaskedLM, XLMRobertaModel, XLMRobertaForMultipleChoice,
+                                       XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification)
    # Optimization
    from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
@@ -105,9 +135,9 @@ if is_torch_available():
 # TensorFlow
 if is_tf_available():
-    from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary
+    from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
    from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
-                                   TFAutoModelWithLMHead)
+                                   TFAutoModelWithLMHead, TFAutoModelForTokenClassification, TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings,
                                   TFBertModel, TFBertForPreTraining,
@@ -131,6 +161,7 @@ if is_tf_available():
    from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
                                    TFXLNetModel, TFXLNetLMHeadModel,
                                    TFXLNetForSequenceClassification,
+                                    TFXLNetForTokenClassification,
                                    TFXLNetForQuestionAnsweringSimple,
                                    TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -149,6 +180,7 @@ if is_tf_available():
    from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
                                         TFDistilBertModel, TFDistilBertForMaskedLM,
                                         TFDistilBertForSequenceClassification,
+                                         TFDistilBertForTokenClassification,
                                         TFDistilBertForQuestionAnswering,
                                         TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -156,6 +188,16 @@ if is_tf_available():
                                    TFCTRLLMHeadModel,
                                    TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
+                                     TFAlbertForSequenceClassification,
+                                    TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_tf_t5 import (TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel,
+                                 TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+    # Optimization
+    from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
 # TF 2.0 <=> PyTorch conversion utilities
 from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
                                        load_pytorch_checkpoint_in_tf2_model,
@@ -165,6 +207,10 @@ from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name
                                        load_tf2_weights_in_pytorch_model,
                                        load_tf2_model_in_pytorch_model)
+# Pipelines
+from .pipelines import pipeline, PipelineDataFormat, CsvPipelineDataFormat, JsonPipelineDataFormat, PipedPipelineDataFormat, \
+    Pipeline, FeatureExtractionPipeline, QuestionAnsweringPipeline, NerPipeline, TextClassificationPipeline
 if not is_tf_available() and not is_torch_available():
    logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found."
                   "Models won't be available and only tokenizers, configuration"