Commit 40ed7172 authored by erenup

Merge remote-tracking branch 'refs/remotes/huggingface/master'

parents 86a63070 7296f101
@@ -43,7 +43,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
XLNetTokenizer, RobertaConfig,
RobertaForMultipleChoice, RobertaTokenizer)
from transformers import AdamW, WarmupLinearSchedule
from transformers import AdamW, get_linear_schedule_with_warmup
from utils_multiple_choice import (convert_examples_to_features, processors)
@@ -101,7 +101,7 @@ def train(args, train_dataset, model, tokenizer):
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
if args.fp16:
try:
from apex import amp
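For reference, WarmupLinearSchedule(optimizer, warmup_steps=..., t_total=...) is replaced throughout these scripts by get_linear_schedule_with_warmup(optimizer, num_warmup_steps=..., num_training_steps=...), which implements the same linear warmup followed by linear decay; warmup_steps maps to num_warmup_steps and t_total to num_training_steps. A minimal, self-contained sketch of the new setup (the toy model, learning rate and step counts are placeholders, not values from the scripts):

import torch
from transformers import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)      # placeholder model, for illustration only
t_total = 1000                      # total number of optimizer updates
warmup_steps = 100                  # linear warmup before linear decay

optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

for _ in range(t_total):
    # ... forward / backward pass would go here ...
    optimizer.step()                # update parameters first
    scheduler.step()                # then advance the learning-rate schedule
    optimizer.zero_grad()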
@@ -226,9 +226,13 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
# multi-gpu evaluate
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Eval!
logger.info("***** Running evaluation {} *****".format(prefix))
logger.info(" Num examples = %d", len(eval_dataset))
@@ -464,9 +468,17 @@ def main():
args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name)
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
num_labels=num_labels,
finetuning_task=args.task_name,
cache_dir=args.cache_dir if args.cache_dir else None)
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None)
model = model_class.from_pretrained(args.model_name_or_path,
from_tf=bool('.ckpt' in args.model_name_or_path),
config=config,
cache_dir=args.cache_dir if args.cache_dir else None)
if args.local_rank == 0:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
@@ -13,7 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert). """
""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). """
from __future__ import absolute_import, division, print_function
@@ -33,17 +33,23 @@ from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
from transformers import AdamW, WarmupLinearSchedule
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer
from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer
logger = logging.getLogger(__name__)
ALL_MODELS = sum(
(tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, )),
(tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
())
MODEL_CLASSES = {
"bert": (BertConfig, BertForTokenClassification, BertTokenizer),
"roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
"distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
"camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
}
@@ -78,7 +84,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
if args.fp16:
try:
from apex import amp
@@ -119,9 +125,10 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
batch = tuple(t.to(args.device) for t in batch)
inputs = {"input_ids": batch[0],
"attention_mask": batch[1],
"token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
# XLM and RoBERTa don't use segment_ids
"labels": batch[3]}
if args.model_type != "distilbert":
inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids
outputs = model(**inputs)
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
@@ -133,13 +140,16 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
scheduler.step() # Update learning rate schedule
optimizer.step()
model.zero_grad()
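The reordering in this hunk applies gradient clipping once per optimizer update rather than once per backward pass, so that with gradient accumulation the clip acts on the fully accumulated gradient. A small self-contained sketch of that pattern (toy model, data and hyperparameters, not taken from the script):

import torch

model = torch.nn.Linear(10, 2)                      # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
data = [(torch.randn(4, 10), torch.randn(4, 2)) for _ in range(8)]
accumulation_steps = 4
max_grad_norm = 1.0

model.zero_grad()
for step, (x, y) in enumerate(data):
    loss = torch.nn.functional.mse_loss(model(x), y) / accumulation_steps
    loss.backward()                                 # gradients accumulate across micro-batches
    if (step + 1) % accumulation_steps == 0:
        # Clip the accumulated gradient once, right before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        model.zero_grad()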
@@ -148,7 +158,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
# Log metrics
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id)
results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev")
for key, value in results.items():
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
@@ -186,6 +196,10 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
# multi-gpu evaluate
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Eval!
logger.info("***** Running evaluation %s *****", prefix)
logger.info(" Num examples = %d", len(eval_dataset))
@@ -201,12 +215,15 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
with torch.no_grad():
inputs = {"input_ids": batch[0],
"attention_mask": batch[1],
"token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
# XLM and RoBERTa don't use segment_ids
"labels": batch[3]}
if args.model_type != "distilbert":
inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids
outputs = model(**inputs)
tmp_eval_loss, logits = outputs[:2]
if args.n_gpu > 1:
tmp_eval_loss = tmp_eval_loss.mean() # mean() to average on multi-gpu parallel evaluating
eval_loss += tmp_eval_loss.item()
nb_eval_steps += 1
if preds is None:
@@ -420,11 +437,15 @@ def main():
args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
num_labels=num_labels)
num_labels=num_labels,
cache_dir=args.cache_dir if args.cache_dir else None)
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
do_lower_case=args.do_lower_case)
model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path),
config=config)
do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None)
model = model_class.from_pretrained(args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
cache_dir=args.cache_dir if args.cache_dir else None)
if args.local_rank == 0:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
@@ -508,3 +529,4 @@ def main():
if __name__ == "__main__":
main()
@@ -16,17 +16,18 @@
""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
from __future__ import absolute_import, division, print_function
from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult
from transformers.data.metrics.squad_metrics import compute_predictions_logits, compute_predictions_log_probs, squad_evaluate
import argparse
import logging
import os
import random
import glob
import timeit
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.utils.data.distributed import DistributedSampler
try:
@@ -42,18 +43,12 @@ from transformers import (WEIGHTS_NAME, BertConfig,
XLMTokenizer, XLNetConfig,
XLNetForQuestionAnswering,
XLNetTokenizer,
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
from transformers import AdamW, WarmupLinearSchedule
from utils_squad import (read_squad_examples, convert_examples_to_features,
RawResult, write_predictions,
RawResultExtended, write_predictions_extended)
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer,
AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer,
XLMConfig, XLMForQuestionAnswering, XLMTokenizer,
)
# The following import is the official SQuAD evaluation script (2.0).
# You can remove it from the dependencies if you are using this script outside of the library
# We've added it here for automated tests (see examples/test_examples.py file)
from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features
logger = logging.getLogger(__name__)
@@ -64,7 +59,9 @@ MODEL_CLASSES = {
'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
}
def set_seed(args):
@@ -97,14 +94,16 @@ def train(args, train_dataset, model, tokenizer):
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
if args.fp16:
try:
from apex import amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
# multi-gpu training (should be after apex fp16 initialization)
@@ -127,25 +126,31 @@ def train(args, train_dataset, model, tokenizer):
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
logger.info(" Total optimization steps = %d", t_total)
global_step = 0
global_step = 1
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args) # Added here for reproducibility (even between python 2 and 3)
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
model.train()
batch = tuple(t.to(args.device) for t in batch)
inputs = {'input_ids': batch[0],
'attention_mask': batch[1],
'start_positions': batch[3],
'end_positions': batch[4]}
inputs = {
'input_ids': batch[0],
'attention_mask': batch[1],
'start_positions': batch[3],
'end_positions': batch[4]
}
if args.model_type != 'distilbert':
inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
if args.model_type in ['xlnet', 'xlm']:
inputs.update({'cls_index': batch[5],
'p_mask': batch[6]})
inputs.update({'cls_index': batch[5], 'p_mask': batch[6]})
outputs = model(**inputs)
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
@@ -157,20 +162,23 @@ def train(args, train_dataset, model, tokenizer):
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
# Log metrics
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
# Log metrics
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
results = evaluate(args, model, tokenizer)
for key, value in results.items():
@@ -179,8 +187,8 @@ def train(args, train_dataset, model, tokenizer):
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
logging_loss = tr_loss
# Save model checkpoint
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
# Save model checkpoint
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
@@ -209,124 +217,162 @@ def evaluate(args, model, tokenizer, prefix=""):
os.makedirs(args.output_dir)
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
eval_sampler = SequentialSampler(dataset)
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
# multi-gpu evaluate
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Eval!
logger.info("***** Running evaluation {} *****".format(prefix))
logger.info(" Num examples = %d", len(dataset))
logger.info(" Batch size = %d", args.eval_batch_size)
all_results = []
start_time = timeit.default_timer()
for batch in tqdm(eval_dataloader, desc="Evaluating"):
model.eval()
batch = tuple(t.to(args.device) for t in batch)
with torch.no_grad():
inputs = {'input_ids': batch[0],
'attention_mask': batch[1]
}
inputs = {
'input_ids': batch[0],
'attention_mask': batch[1]
}
if args.model_type != 'distilbert':
inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM doesn't use segment_ids
example_indices = batch[3]
# XLNet and XLM use more arguments for their predictions
if args.model_type in ['xlnet', 'xlm']:
inputs.update({'cls_index': batch[4],
'p_mask': batch[5]})
inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
outputs = model(**inputs)
for i, example_index in enumerate(example_indices):
eval_feature = features[example_index.item()]
unique_id = int(eval_feature.unique_id)
if args.model_type in ['xlnet', 'xlm']:
# XLNet uses a more complex post-processing procedure
result = RawResultExtended(unique_id = unique_id,
start_top_log_probs = to_list(outputs[0][i]),
start_top_index = to_list(outputs[1][i]),
end_top_log_probs = to_list(outputs[2][i]),
end_top_index = to_list(outputs[3][i]),
cls_logits = to_list(outputs[4][i]))
output = [to_list(output[i]) for output in outputs]
# Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
# models only use two.
if len(output) >= 5:
start_logits = output[0]
start_top_index = output[1]
end_logits = output[2]
end_top_index = output[3]
cls_logits = output[4]
result = SquadResult(
unique_id, start_logits, end_logits,
start_top_index=start_top_index,
end_top_index=end_top_index,
cls_logits=cls_logits
)
else:
result = RawResult(unique_id = unique_id,
start_logits = to_list(outputs[0][i]),
end_logits = to_list(outputs[1][i]))
start_logits, end_logits = output
result = SquadResult(
unique_id, start_logits, end_logits
)
all_results.append(result)
evalTime = timeit.default_timer() - start_time
logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))
# Compute predictions
output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
if args.version_2_with_negative:
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
else:
output_null_log_odds_file = None
# XLNet and XLM use a more complex post-processing procedure
if args.model_type in ['xlnet', 'xlm']:
# XLNet uses a more complex post-processing procedure
write_predictions_extended(examples, features, all_results, args.n_best_size,
start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size,
args.max_answer_length, output_prediction_file,
output_nbest_file, output_null_log_odds_file, args.predict_file,
model.config.start_n_top, model.config.end_n_top,
output_nbest_file, output_null_log_odds_file,
start_n_top, end_n_top,
args.version_2_with_negative, tokenizer, args.verbose_logging)
else:
write_predictions(examples, features, all_results, args.n_best_size,
predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size,
args.max_answer_length, args.do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file, args.verbose_logging,
args.version_2_with_negative, args.null_score_diff_threshold)
# Evaluate with the official SQuAD script
evaluate_options = EVAL_OPTS(data_file=args.predict_file,
pred_file=output_prediction_file,
na_prob_file=output_null_log_odds_file)
results = evaluate_on_squad(evaluate_options)
# Compute the F1 and exact scores.
results = squad_evaluate(examples, predictions)
return results
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
if args.local_rank not in [-1, 0] and not evaluate:
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
# Load data features from cache or dataset file
input_file = args.predict_file if evaluate else args.train_file
cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
input_dir = args.data_dir if args.data_dir else "."
cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format(
'dev' if evaluate else 'train',
list(filter(None, args.model_name_or_path.split('/'))).pop(),
str(args.max_seq_length)))
str(args.max_seq_length))
)
# Init features and dataset from cache if it exists
if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
logger.info("Loading features from cached file %s", cached_features_file)
features = torch.load(cached_features_file)
features_and_dataset = torch.load(cached_features_file)
features, dataset = features_and_dataset["features"], features_and_dataset["dataset"]
else:
logger.info("Creating features from dataset file at %s", input_file)
examples = read_squad_examples(input_file=input_file,
is_training=not evaluate,
version_2_with_negative=args.version_2_with_negative)
features = convert_examples_to_features(examples=examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=not evaluate)
logger.info("Creating features from dataset file at %s", input_dir)
if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
try:
import tensorflow_datasets as tfds
except ImportError:
raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")
if args.version_2_with_negative:
logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")
tfds_examples = tfds.load("squad")
examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
else:
processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
if evaluate:
examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
else:
examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
features, dataset = squad_convert_examples_to_features(
examples=examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=not evaluate,
return_dataset='pt'
)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
torch.save(features, cached_features_file)
torch.save({"features": features, "dataset": dataset}, cached_features_file)
if args.local_rank == 0 and not evaluate:
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
if evaluate:
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
all_example_index, all_cls_index, all_p_mask)
else:
all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
all_start_positions, all_end_positions,
all_cls_index, all_p_mask)
if output_examples:
return dataset, examples, features
return dataset
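For reference, the new data path above boils down to: read examples with a SQuAD processor, then convert them with squad_convert_examples_to_features, where return_dataset='pt' also returns a ready-made TensorDataset. A minimal sketch of that call outside the script (the paths, tokenizer and hyperparameters below are placeholders):

from transformers import BertTokenizer, squad_convert_examples_to_features
from transformers.data.processors.squad import SquadV2Processor

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Placeholder data directory containing the usual SQuAD v2.0 json files.
processor = SquadV2Processor()
examples = processor.get_train_examples("data/squad", filename="train-v2.0.json")

features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=True,
    return_dataset="pt",     # also build and return a torch TensorDataset
)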
@@ -336,10 +382,6 @@ def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--train_file", default=None, type=str, required=True,
help="SQuAD json for training. E.g., train-v1.1.json")
parser.add_argument("--predict_file", default=None, type=str, required=True,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
parser.add_argument("--model_type", default=None, type=str, required=True,
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
@@ -348,6 +390,15 @@ def main():
help="The output directory where the model checkpoints and predictions will be written.")
## Other parameters
parser.add_argument("--data_dir", default=None, type=str,
help="The input data dir. Should contain the .json files for the task." +
"If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
parser.add_argument("--train_file", default=None, type=str,
help="The input training file. If a data dir is specified, will look for the file there" +
"If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
parser.add_argument("--predict_file", default=None, type=str,
help="The input evaluation file. If a data dir is specified, will look for the file there" +
"If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
parser.add_argument("--config_name", default="", type=str,
help="Pretrained config name or path if not the same as model_name")
parser.add_argument("--tokenizer_name", default="", type=str,
@@ -386,7 +437,7 @@ def main():
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--weight_decay", default=0.0, type=float,
help="Weight deay if we apply some.")
help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float,
@@ -470,9 +521,15 @@ def main():
args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
cache_dir=args.cache_dir if args.cache_dir else None)
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None)
model = model_class.from_pretrained(args.model_name_or_path,
from_tf=bool('.ckpt' in args.model_name_or_path),
config=config,
cache_dir=args.cache_dir if args.cache_dir else None)
if args.local_rank == 0:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
@@ -515,7 +572,7 @@ def main():
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
# Load a trained model and vocabulary that you have fine-tuned
model = model_class.from_pretrained(args.output_dir)
model = model_class.from_pretrained(args.output_dir, force_download=True)
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
model.to(args.device)
@@ -533,7 +590,7 @@ def main():
for checkpoint in checkpoints:
# Reload the model
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
model = model_class.from_pretrained(checkpoint)
model = model_class.from_pretrained(checkpoint, force_download=True)
model.to(args.device)
# Evaluate
import os
import tensorflow as tf
import tensorflow_datasets
from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features, BertForSequenceClassification
from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig, glue_convert_examples_to_features, BertForSequenceClassification, glue_processors
# script parameters
BATCH_SIZE = 32
EVAL_BATCH_SIZE = BATCH_SIZE * 2
USE_XLA = False
USE_AMP = False
EPOCHS = 3
TASK = "mrpc"
if TASK == "sst-2":
TFDS_TASK = "sst2"
elif TASK == "sts-b":
TFDS_TASK = "stsb"
else:
TFDS_TASK = TASK
num_labels = len(glue_processors[TASK]().get_labels())
print(num_labels)
tf.config.optimizer.set_jit(USE_XLA)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
# Load tokenizer and model from pretrained model/vocabulary
# Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config)
# Load dataset via TensorFlow Datasets
data, info = tensorflow_datasets.load('glue/mrpc', with_info=True)
data, info = tensorflow_datasets.load(f'glue/{TFDS_TASK}', with_info=True)
train_examples = info.splits['train'].num_examples
# MNLI expects either validation_matched or validation_mismatched
valid_examples = info.splits['validation'].num_examples
# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, TASK)
# MNLI expects either validation_matched or validation_mismatched
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, TASK)
train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
@@ -32,7 +50,13 @@ opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
if USE_AMP:
# loss scaling is currently required when using mixed precision
opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
if num_labels == 1:
loss = tf.keras.losses.MeanSquaredError()
else:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=opt, loss=loss, metrics=[metric])
@@ -40,24 +64,30 @@ model.compile(optimizer=opt, loss=loss, metrics=[metric])
train_steps = train_examples//BATCH_SIZE
valid_steps = valid_examples//EVAL_BATCH_SIZE
history = model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps,
history = model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=train_steps,
validation_data=valid_dataset, validation_steps=valid_steps)
# Save TF2 model
os.makedirs('./save/', exist_ok=True)
model.save_pretrained('./save/')
# Load the TensorFlow model in PyTorch for inspection
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
if TASK == "mrpc":
# Load the TensorFlow model in PyTorch for inspection
# This is to demo the interoperability between the two frameworks; you don't have to
# do this in real life (you can run the inference on the TF model).
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
sentence_0 = 'This research was consistent with his findings.'
sentence_1 = 'His findings were compatible with this research.'
sentence_2 = 'His findings were not compatible with this research.'
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
sentence_0 = 'This research was consistent with his findings.'
sentence_1 = 'His findings were compatible with this research.'
sentence_2 = 'His findings were not compatible with this research.'
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
del inputs_1["special_tokens_mask"]
del inputs_2["special_tokens_mask"]
pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')
pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')
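The num_labels logic above comes from the GLUE processors: each task's processor reports its label set, and STS-B is the only task whose processor reports a single (regression) label, which is why num_labels == 1 switches the loss to mean squared error. A small sketch of where those label counts come from (task names as registered in glue_processors):

from transformers import glue_processors

# STS-B is a regression task (a single label); the others are classification.
for task in ("mrpc", "sst-2", "sts-b", "mnli"):
    labels = glue_processors[task]().get_labels()
    print(task, len(labels), labels)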
# coding=utf-8
import datetime
import os
import math
import glob
import re
import tensorflow as tf
import collections
import numpy as np
from seqeval import metrics
import _pickle as pickle
from absl import logging
from transformers import TF2_WEIGHTS_NAME, BertConfig, BertTokenizer, TFBertForTokenClassification
from transformers import RobertaConfig, RobertaTokenizer, TFRobertaForTokenClassification
from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForTokenClassification
from transformers import create_optimizer, GradientAccumulator
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
from fastprogress import master_bar, progress_bar
from absl import flags
from absl import app
ALL_MODELS = sum(
(tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
())
MODEL_CLASSES = {
"bert": (BertConfig, TFBertForTokenClassification, BertTokenizer),
"roberta": (RobertaConfig, TFRobertaForTokenClassification, RobertaTokenizer),
"distilbert": (DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer)
}
flags.DEFINE_string(
"data_dir", None,
"The input data dir. Should contain the .conll files (or other data files) "
"for the task.")
flags.DEFINE_string(
"model_type", None,
"Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
flags.DEFINE_string(
"model_name_or_path", None,
"Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
flags.DEFINE_string(
"output_dir", None,
"The output directory where the model checkpoints will be written.")
flags.DEFINE_string(
"labels", "",
"Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
flags.DEFINE_string(
"config_name", "",
"Pretrained config name or path if not the same as model_name")
flags.DEFINE_string(
"tokenizer_name", "",
"Pretrained tokenizer name or path if not the same as model_name")
flags.DEFINE_string(
"cache_dir", "",
"Where do you want to store the pre-trained models downloaded from s3")
flags.DEFINE_integer(
"max_seq_length", 128,
"The maximum total input sentence length after tokenization. "
"Sequences longer than this will be truncated, sequences shorter "
"will be padded.")
flags.DEFINE_string(
"tpu", None,
"The Cloud TPU to use for training. This should be either the name "
"used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
"url.")
flags.DEFINE_integer(
"num_tpu_cores", 8,
"Total number of TPU cores to use.")
flags.DEFINE_boolean(
"do_train", False,
"Whether to run training.")
flags.DEFINE_boolean(
"do_eval", False,
"Whether to run eval on the dev set.")
flags.DEFINE_boolean(
"do_predict", False,
"Whether to run predictions on the test set.")
flags.DEFINE_boolean(
"evaluate_during_training", False,
"Whether to run evaluation during training at each logging step.")
flags.DEFINE_boolean(
"do_lower_case", False,
"Set this flag if you are using an uncased model.")
flags.DEFINE_integer(
"per_device_train_batch_size", 8,
"Batch size per GPU/CPU/TPU for training.")
flags.DEFINE_integer(
"per_device_eval_batch_size", 8,
"Batch size per GPU/CPU/TPU for evaluation.")
flags.DEFINE_integer(
"gradient_accumulation_steps", 1,
"Number of updates steps to accumulate before performing a backward/update pass.")
flags.DEFINE_float(
"learning_rate", 5e-5,
"The initial learning rate for Adam.")
flags.DEFINE_float(
"weight_decay", 0.0,
"Weight decay if we apply some.")
flags.DEFINE_float(
"adam_epsilon", 1e-8,
"Epsilon for Adam optimizer.")
flags.DEFINE_float(
"max_grad_norm", 1.0,
"Max gradient norm.")
flags.DEFINE_integer(
"num_train_epochs", 3,
"Total number of training epochs to perform.")
flags.DEFINE_integer(
"max_steps", -1,
"If > 0: set total number of training steps to perform. Override num_train_epochs.")
flags.DEFINE_integer(
"warmup_steps", 0,
"Linear warmup over warmup_steps.")
flags.DEFINE_integer(
"logging_steps", 50,
"Log every X updates steps.")
flags.DEFINE_integer(
"save_steps", 50,
"Save checkpoint every X updates steps.")
flags.DEFINE_boolean(
"eval_all_checkpoints", False,
"Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
flags.DEFINE_boolean(
"no_cuda", False,
"Avoid using CUDA when available")
flags.DEFINE_boolean(
"overwrite_output_dir", False,
"Overwrite the content of the output directory")
flags.DEFINE_boolean(
"overwrite_cache", False,
"Overwrite the cached training and evaluation sets")
flags.DEFINE_integer(
"seed", 42,
"random seed for initialization")
flags.DEFINE_boolean(
"fp16", False,
"Whether to use 16-bit (mixed) precision instead of 32-bit")
flags.DEFINE_string(
"gpus", "0",
"Comma separated list of gpus devices. If only one, switch to single "
"gpu strategy, if None takes all the gpus available.")
def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id):
if args['max_steps'] > 0:
num_train_steps = args['max_steps'] * args['gradient_accumulation_steps']
args['num_train_epochs'] = 1
else:
num_train_steps = math.ceil(num_train_examples / train_batch_size) // args['gradient_accumulation_steps'] * args['num_train_epochs']
writer = tf.summary.create_file_writer("/tmp/mylogs")
with strategy.scope():
loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
optimizer = create_optimizer(args['learning_rate'], num_train_steps, args['warmup_steps'])
if args['fp16']:
optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
loss_metric = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)
gradient_accumulator = GradientAccumulator()
logging.info("***** Running training *****")
logging.info(" Num examples = %d", num_train_examples)
logging.info(" Num Epochs = %d", args['num_train_epochs'])
logging.info(" Instantaneous batch size per device = %d", args['per_device_train_batch_size'])
logging.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
train_batch_size * args['gradient_accumulation_steps'])
logging.info(" Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
logging.info(" Total training steps = %d", num_train_steps)
model.summary()
@tf.function
def apply_gradients():
grads_and_vars = []
for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables):
if gradient is not None:
scaled_gradient = gradient / (args['n_device'] * args['gradient_accumulation_steps'])
grads_and_vars.append((scaled_gradient, variable))
else:
grads_and_vars.append((gradient, variable))
optimizer.apply_gradients(grads_and_vars, args['max_grad_norm'])
gradient_accumulator.reset()
@tf.function
def train_step(train_features, train_labels):
def step_fn(train_features, train_labels):
inputs = {'attention_mask': train_features['input_mask'], 'training': True}
if args['model_type'] != "distilbert":
inputs["token_type_ids"] = train_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None
with tf.GradientTape() as tape:
logits = model(train_features['input_ids'], **inputs)[0]
logits = tf.reshape(logits, (-1, len(labels) + 1))
active_loss = tf.reshape(train_features['input_mask'], (-1,))
active_logits = tf.boolean_mask(logits, active_loss)
train_labels = tf.reshape(train_labels, (-1,))
active_labels = tf.boolean_mask(train_labels, active_loss)
cross_entropy = loss_fct(active_labels, active_logits)
loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size)
grads = tape.gradient(loss, model.trainable_variables)
gradient_accumulator(grads)
return cross_entropy
per_example_losses = strategy.experimental_run_v2(step_fn, args=(train_features, train_labels))
mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0)
return mean_loss
current_time = datetime.datetime.now()
train_iterator = master_bar(range(args['num_train_epochs']))
global_step = 0
logging_loss = 0.0
for epoch in train_iterator:
epoch_iterator = progress_bar(train_dataset, total=num_train_steps, parent=train_iterator, display=args['n_device'] > 1)
step = 1
with strategy.scope():
for train_features, train_labels in epoch_iterator:
loss = train_step(train_features, train_labels)
if step % args['gradient_accumulation_steps'] == 0:
strategy.experimental_run_v2(apply_gradients)
loss_metric(loss)
global_step += 1
if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
# Log metrics
if args['n_device'] == 1 and args['evaluate_during_training']: # Only evaluate when single GPU otherwise metrics may not average well
y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
report = metrics.classification_report(y_true, y_pred, digits=4)
logging.info("Eval at step " + str(global_step) + "\n" + report)
logging.info("eval_loss: " + str(eval_loss))
precision = metrics.precision_score(y_true, y_pred)
recall = metrics.recall_score(y_true, y_pred)
f1 = metrics.f1_score(y_true, y_pred)
with writer.as_default():
tf.summary.scalar("eval_loss", eval_loss, global_step)
tf.summary.scalar("precision", precision, global_step)
tf.summary.scalar("recall", recall, global_step)
tf.summary.scalar("f1", f1, global_step)
lr = optimizer.learning_rate
learning_rate = lr(step)
with writer.as_default():
tf.summary.scalar("lr", learning_rate, global_step)
tf.summary.scalar("loss", (loss_metric.result() - logging_loss) / args['logging_steps'], global_step)
logging_loss = loss_metric.result()
with writer.as_default():
tf.summary.scalar("loss", loss_metric.result(), step=step)
if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
# Save model checkpoint
output_dir = os.path.join(args['output_dir'], "checkpoint-{}".format(global_step))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
model.save_pretrained(output_dir)
logging.info("Saving model checkpoint to %s", output_dir)
train_iterator.child.comment = f'loss : {loss_metric.result()}'
step += 1
train_iterator.write(f'loss epoch {epoch + 1}: {loss_metric.result()}')
loss_metric.reset_states()
logging.info(" Training took time = {}".format(datetime.datetime.now() - current_time))
def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode):
eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
eval_dataset, size = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode)
eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)
preds = None
num_eval_steps = math.ceil(size / eval_batch_size)
master = master_bar(range(1))
eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args['n_device'] > 1)
loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
loss = 0.0
logging.info("***** Running evaluation *****")
logging.info(" Num examples = %d", size)
logging.info(" Batch size = %d", eval_batch_size)
for eval_features, eval_labels in eval_iterator:
inputs = {'attention_mask': eval_features['input_mask'], 'training': False}
if args['model_type'] != "distilbert":
inputs["token_type_ids"] = eval_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None
with strategy.scope():
logits = model(eval_features['input_ids'], **inputs)[0]
tmp_logits = tf.reshape(logits, (-1, len(labels) + 1))
active_loss = tf.reshape(eval_features['input_mask'], (-1,))
active_logits = tf.boolean_mask(tmp_logits, active_loss)
tmp_eval_labels = tf.reshape(eval_labels, (-1,))
active_labels = tf.boolean_mask(tmp_eval_labels, active_loss)
cross_entropy = loss_fct(active_labels, active_logits)
loss += tf.reduce_sum(cross_entropy) * (1.0 / eval_batch_size)
if preds is None:
preds = logits.numpy()
label_ids = eval_labels.numpy()
else:
preds = np.append(preds, logits.numpy(), axis=0)
label_ids = np.append(label_ids, eval_labels.numpy(), axis=0)
preds = np.argmax(preds, axis=2)
y_pred = [[] for _ in range(label_ids.shape[0])]
y_true = [[] for _ in range(label_ids.shape[0])]
loss = loss / num_eval_steps
for i in range(label_ids.shape[0]):
for j in range(label_ids.shape[1]):
if label_ids[i, j] != pad_token_label_id:
y_pred[i].append(labels[preds[i, j] - 1])
y_true[i].append(labels[label_ids[i, j] - 1])
return y_true, y_pred, loss.numpy()
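evaluate() returns y_true and y_pred as one label list per sentence, which is the format the seqeval metrics used for logging and reporting expect. A tiny self-contained example of that reporting step (toy IOB2 tags):

from seqeval import metrics

y_true = [["B-PER", "I-PER", "O"], ["B-LOC", "O"]]
y_pred = [["B-PER", "I-PER", "O"], ["O", "O"]]

print(metrics.classification_report(y_true, y_pred, digits=4))
print("precision:", metrics.precision_score(y_true, y_pred))
print("recall:", metrics.recall_score(y_true, y_pred))
print("f1:", metrics.f1_score(y_true, y_pred))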
def load_cache(cached_file, max_seq_length):
name_to_features = {
"input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
"input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
"segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
"label_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
}
def _decode_record(record):
example = tf.io.parse_single_example(record, name_to_features)
features = {}
features['input_ids'] = example['input_ids']
features['input_mask'] = example['input_mask']
features['segment_ids'] = example['segment_ids']
return features, example['label_ids']
d = tf.data.TFRecordDataset(cached_file)
d = d.map(_decode_record, num_parallel_calls=4)
count = d.reduce(0, lambda x, _: x + 1)
return d, count.numpy()
def save_cache(features, cached_features_file):
writer = tf.io.TFRecordWriter(cached_features_file)
for (ex_index, feature) in enumerate(features):
if ex_index % 5000 == 0:
logging.info("Writing example %d of %d" % (ex_index, len(features)))
def create_int_feature(values):
f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
return f
record_feature = collections.OrderedDict()
record_feature["input_ids"] = create_int_feature(feature.input_ids)
record_feature["input_mask"] = create_int_feature(feature.input_mask)
record_feature["segment_ids"] = create_int_feature(feature.segment_ids)
record_feature["label_ids"] = create_int_feature(feature.label_ids)
tf_example = tf.train.Example(features=tf.train.Features(feature=record_feature))
writer.write(tf_example.SerializeToString())
writer.close()
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_size, mode):
drop_remainder = True if args['tpu'] or mode == 'train' else False
# Load data features from cache or dataset file
cached_features_file = os.path.join(args['data_dir'], "cached_{}_{}_{}.tf_record".format(mode,
list(filter(None, args['model_name_or_path'].split("/"))).pop(),
str(args['max_seq_length'])))
if os.path.exists(cached_features_file) and not args['overwrite_cache']:
logging.info("Loading features from cached file %s", cached_features_file)
dataset, size = load_cache(cached_features_file, args['max_seq_length'])
else:
logging.info("Creating features from dataset file at %s", args['data_dir'])
examples = read_examples_from_file(args['data_dir'], mode)
features = convert_examples_to_features(examples, labels, args['max_seq_length'], tokenizer,
cls_token_at_end=bool(args['model_type'] in ["xlnet"]),
# xlnet has a cls token at the end
cls_token=tokenizer.cls_token,
cls_token_segment_id=2 if args['model_type'] in ["xlnet"] else 0,
sep_token=tokenizer.sep_token,
sep_token_extra=bool(args['model_type'] in ["roberta"]),
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(args['model_type'] in ["xlnet"]),
# pad on the left for xlnet
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
pad_token_segment_id=4 if args['model_type'] in ["xlnet"] else 0,
pad_token_label_id=pad_token_label_id
)
logging.info("Saving features into cached file %s", cached_features_file)
save_cache(features, cached_features_file)
dataset, size = load_cache(cached_features_file, args['max_seq_length'])
if mode == 'train':
dataset = dataset.repeat()
dataset = dataset.shuffle(buffer_size=8192, seed=args['seed'])
dataset = dataset.batch(batch_size, drop_remainder)
dataset = dataset.prefetch(buffer_size=batch_size)
return dataset, size
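The caching above is a plain TFRecord round-trip: save_cache() serialises each feature as a tf.train.Example of int64 lists, and load_cache() parses records back with a matching FixedLenFeature spec. A condensed, self-contained version of that round-trip (toy values, max_seq_length of 4, temporary file path chosen for the example):

import tensorflow as tf

max_seq_length = 4
path = "/tmp/toy.tf_record"

def int_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

# Write one toy example with the same four int64 fields as save_cache().
example = tf.train.Example(features=tf.train.Features(feature={
    "input_ids": int_feature([101, 7592, 102, 0]),
    "input_mask": int_feature([1, 1, 1, 0]),
    "segment_ids": int_feature([0, 0, 0, 0]),
    "label_ids": int_feature([0, 1, 0, 0]),
}))
with tf.io.TFRecordWriter(path) as writer:
    writer.write(example.SerializeToString())

# Read it back with the same spec load_cache() uses.
spec = {name: tf.io.FixedLenFeature([max_seq_length], tf.int64)
        for name in ("input_ids", "input_mask", "segment_ids", "label_ids")}
dataset = tf.data.TFRecordDataset(path).map(
    lambda record: tf.io.parse_single_example(record, spec))
for parsed in dataset:
    print({name: tensor.numpy() for name, tensor in parsed.items()})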
def main(_):
logging.set_verbosity(logging.INFO)
args = flags.FLAGS.flag_values_dict()
if os.path.exists(args['output_dir']) and os.listdir(
args['output_dir']) and args['do_train'] and not args['overwrite_output_dir']:
raise ValueError(
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
args['output_dir']))
if args['fp16']:
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
if args['tpu']:
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args['tpu'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)
args['n_device'] = args['num_tpu_cores']
elif len(args['gpus'].split(',')) > 1:
args['n_device'] = len([f"/gpu:{gpu}" for gpu in args['gpus'].split(',')])
strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args['gpus'].split(',')])
elif args['no_cuda']:
args['n_device'] = 1
strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
else:
args['n_device'] = len(args['gpus'].split(','))
strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args['gpus'].split(',')[0])
logging.warning("n_device: %s, distributed training: %s, 16-bits training: %s",
args['n_device'], bool(args['n_device'] > 1), args['fp16'])
labels = get_labels(args['labels'])
num_labels = len(labels) + 1
pad_token_label_id = 0
config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
config = config_class.from_pretrained(args['config_name'] if args['config_name'] else args['model_name_or_path'],
num_labels=num_labels,
cache_dir=args['cache_dir'] if args['cache_dir'] else None)
logging.info("Training/evaluation parameters %s", args)
# Training
if args['do_train']:
tokenizer = tokenizer_class.from_pretrained(args['tokenizer_name'] if args['tokenizer_name'] else args['model_name_or_path'],
do_lower_case=args['do_lower_case'],
cache_dir=args['cache_dir'] if args['cache_dir'] else None)
with strategy.scope():
model = model_class.from_pretrained(args['model_name_or_path'],
from_pt=bool(".bin" in args['model_name_or_path']),
config=config,
cache_dir=args['cache_dir'] if args['cache_dir'] else None)
model.layers[-1].activation = tf.keras.activations.softmax
train_batch_size = args['per_device_train_batch_size'] * args['n_device']
train_dataset, num_train_examples = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train")
train_dataset = strategy.experimental_distribute_dataset(train_dataset)
train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id)
if not os.path.exists(args['output_dir']):
os.makedirs(args['output_dir'])
logging.info("Saving model to %s", args['output_dir'])
model.save_pretrained(args['output_dir'])
tokenizer.save_pretrained(args['output_dir'])
# Evaluation
if args['do_eval']:
tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
checkpoints = []
results = []
if args['eval_all_checkpoints']:
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + "/**/" + TF2_WEIGHTS_NAME, recursive=True), key=lambda f: int(''.join(filter(str.isdigit, f)) or -1)))
logging.info("Evaluate the following checkpoints: %s", checkpoints)
if len(checkpoints) == 0:
checkpoints.append(args['output_dir'])
for checkpoint in checkpoints:
global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final"
with strategy.scope():
model = model_class.from_pretrained(checkpoint)
y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
report = metrics.classification_report(y_true, y_pred, digits=4)
if global_step:
results.append({global_step + "_report": report, global_step + "_loss": eval_loss})
output_eval_file = os.path.join(args['output_dir'], "eval_results.txt")
with tf.io.gfile.GFile(output_eval_file, "w") as writer:
for res in results:
for key, val in res.items():
if "loss" in key:
logging.info(key + " = " + str(val))
writer.write(key + " = " + str(val))
writer.write("\n")
else:
logging.info(key)
logging.info("\n" + report)
writer.write(key + "\n")
writer.write(report)
writer.write("\n")
if args['do_predict']:
tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
model = model_class.from_pretrained(args['output_dir'])
eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
predict_dataset, _ = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test")
y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test")
output_test_results_file = os.path.join(args['output_dir'], "test_results.txt")
output_test_predictions_file = os.path.join(args['output_dir'], "test_predictions.txt")
report = metrics.classification_report(y_true, y_pred, digits=4)
with tf.io.gfile.GFile(output_test_results_file, "w") as writer:
report = metrics.classification_report(y_true, y_pred, digits=4)
logging.info("\n" + report)
writer.write(report)
writer.write("\n\nloss = " + str(pred_loss))
with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer:
with tf.io.gfile.GFile(os.path.join(args['data_dir'], "test.txt"), "r") as f:
example_id = 0
for line in f:
if line.startswith("-DOCSTART-") or line == "" or line == "\n":
writer.write(line)
if not y_pred[example_id]:
example_id += 1
elif y_pred[example_id]:
output_line = line.split()[0] + " " + y_pred[example_id].pop(0) + "\n"
writer.write(output_line)
else:
logging.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
if __name__ == "__main__":
flags.mark_flag_as_required("data_dir")
flags.mark_flag_as_required("output_dir")
flags.mark_flag_as_required("model_name_or_path")
flags.mark_flag_as_required("model_type")
app.run(main)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Finetuning multi-lingual models on XNLI (Bert, DistilBERT, XLM).
Adapted from `examples/run_glue.py`"""
from __future__ import absolute_import, division, print_function
import argparse
import glob
import logging
import os
import random
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
try:
from torch.utils.tensorboard import SummaryWriter
except ImportError:
from tensorboardX import SummaryWriter
from tqdm import tqdm, trange
from transformers import (WEIGHTS_NAME,
BertConfig, BertForSequenceClassification, BertTokenizer,
XLMConfig, XLMForSequenceClassification, XLMTokenizer,
DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import xnli_compute_metrics as compute_metrics
from transformers import xnli_output_modes as output_modes
from transformers import xnli_processors as processors
from transformers import glue_convert_examples_to_features as convert_examples_to_features
logger = logging.getLogger(__name__)
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), ())
MODEL_CLASSES = {
'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
}
def set_seed(args):
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
def train(args, train_dataset, model, tokenizer):
""" Train the model """
if args.local_rank in [-1, 0]:
tb_writer = SummaryWriter()
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
if args.max_steps > 0:
t_total = args.max_steps
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
if args.fp16:
try:
from apex import amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
output_device=args.local_rank,
find_unused_parameters=True)
# Train!
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Num Epochs = %d", args.num_train_epochs)
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
logger.info(" Total optimization steps = %d", t_total)
global_step = 0
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args) # Added here for reproducibility (even between python 2 and 3)
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
model.train()
batch = tuple(t.to(args.device) for t in batch)
inputs = {'input_ids': batch[0],
'attention_mask': batch[1],
'labels': batch[3]}
if args.model_type != 'distilbert':
inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None # XLM and DistilBERT don't use segment_ids
outputs = model(**inputs)
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
# Log metrics
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
results = evaluate(args, model, tokenizer)
for key, value in results.items():
tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
logging_loss = tr_loss
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
# Save model checkpoint
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
logger.info("Saving model checkpoint to %s", output_dir)
if args.max_steps > 0 and global_step > args.max_steps:
epoch_iterator.close()
break
if args.max_steps > 0 and global_step > args.max_steps:
train_iterator.close()
break
if args.local_rank in [-1, 0]:
tb_writer.close()
return global_step, tr_loss / global_step
def evaluate(args, model, tokenizer, prefix=""):
eval_task_names = (args.task_name,)
eval_outputs_dirs = (args.output_dir,)
results = {}
for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
os.makedirs(eval_output_dir)
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
# multi-gpu eval
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Eval!
logger.info("***** Running evaluation {} *****".format(prefix))
logger.info(" Num examples = %d", len(eval_dataset))
logger.info(" Batch size = %d", args.eval_batch_size)
eval_loss = 0.0
nb_eval_steps = 0
preds = None
out_label_ids = None
for batch in tqdm(eval_dataloader, desc="Evaluating"):
model.eval()
batch = tuple(t.to(args.device) for t in batch)
with torch.no_grad():
inputs = {'input_ids': batch[0],
'attention_mask': batch[1],
'labels': batch[3]}
if args.model_type != 'distilbert':
inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None # XLM and DistilBERT don't use segment_ids
outputs = model(**inputs)
tmp_eval_loss, logits = outputs[:2]
eval_loss += tmp_eval_loss.mean().item()
nb_eval_steps += 1
if preds is None:
preds = logits.detach().cpu().numpy()
out_label_ids = inputs['labels'].detach().cpu().numpy()
else:
preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
eval_loss = eval_loss / nb_eval_steps
if args.output_mode == "classification":
preds = np.argmax(preds, axis=1)
else:
raise ValueError('No other `output_mode` for XNLI.')
result = compute_metrics(eval_task, preds, out_label_ids)
results.update(result)
output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results {} *****".format(prefix))
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
return results
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
if args.local_rank not in [-1, 0] and not evaluate:
torch.distributed.barrier() # Make sure only the first process in distributed training processes the dataset, and the others will use the cache
processor = processors[task](language=args.language, train_language=args.train_language)
output_mode = output_modes[task]
# Load data features from cache or dataset file
cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}_{}'.format(
'test' if evaluate else 'train',
list(filter(None, args.model_name_or_path.split('/'))).pop(),
str(args.max_seq_length),
str(task),
str(args.train_language if (not evaluate and args.train_language is not None) else args.language)))
if os.path.exists(cached_features_file) and not args.overwrite_cache:
logger.info("Loading features from cached file %s", cached_features_file)
features = torch.load(cached_features_file)
else:
logger.info("Creating features from dataset file at %s", args.data_dir)
label_list = processor.get_labels()
examples = processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
features = convert_examples_to_features(examples,
tokenizer,
label_list=label_list,
max_length=args.max_seq_length,
output_mode=output_mode,
pad_on_left=False,
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
pad_token_segment_id=0,
)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
torch.save(features, cached_features_file)
if args.local_rank == 0 and not evaluate:
torch.distributed.barrier() # Make sure only the first process in distributed training processes the dataset, and the others will use the cache
# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
if output_mode == "classification":
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
else:
raise ValueError('No other `output_mode` for XNLI.')
dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
return dataset
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--data_dir", default=None, type=str, required=True,
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
parser.add_argument("--model_type", default=None, type=str, required=True,
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
parser.add_argument("--language", default=None, type=str, required=True,
help="Evaluation language. Also train language if `train_language` is set to None.")
parser.add_argument("--train_language", default=None, type=str,
help="Train language if is different of the evaluation language.")
parser.add_argument("--output_dir", default=None, type=str, required=True,
help="The output directory where the model predictions and checkpoints will be written.")
## Other parameters
parser.add_argument("--config_name", default="", type=str,
help="Pretrained config name or path if not the same as model_name")
parser.add_argument("--tokenizer_name", default="", type=str,
help="Pretrained tokenizer name or path if not the same as model_name")
parser.add_argument("--cache_dir", default="", type=str,
help="Where do you want to store the pre-trained models downloaded from s3")
parser.add_argument("--max_seq_length", default=128, type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--do_train", action='store_true',
help="Whether to run training.")
parser.add_argument("--do_eval", action='store_true',
help="Whether to run eval on the test set.")
parser.add_argument("--evaluate_during_training", action='store_true',
help="Rul evaluation during training at each logging step.")
parser.add_argument("--do_lower_case", action='store_true',
help="Set this flag if you are using an uncased model.")
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
help="Batch size per GPU/CPU for training.")
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
help="Batch size per GPU/CPU for evaluation.")
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--learning_rate", default=5e-5, type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float,
help="Weight deay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float,
help="Max gradient norm.")
parser.add_argument("--num_train_epochs", default=3.0, type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps", default=-1, type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
parser.add_argument("--warmup_steps", default=0, type=int,
help="Linear warmup over warmup_steps.")
parser.add_argument('--logging_steps', type=int, default=50,
help="Log every X updates steps.")
parser.add_argument('--save_steps', type=int, default=50,
help="Save checkpoint every X updates steps.")
parser.add_argument("--eval_all_checkpoints", action='store_true',
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
parser.add_argument("--no_cuda", action='store_true',
help="Avoid using CUDA when available")
parser.add_argument('--overwrite_output_dir', action='store_true',
help="Overwrite the content of the output directory")
parser.add_argument('--overwrite_cache', action='store_true',
help="Overwrite the cached training and evaluation sets")
parser.add_argument('--seed', type=int, default=42,
help="random seed for initialization")
parser.add_argument('--fp16', action='store_true',
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
parser.add_argument('--fp16_opt_level', type=str, default='O1',
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html")
parser.add_argument("--local_rank", type=int, default=-1,
help="For distributed training: local_rank")
parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
args = parser.parse_args()
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
# Setup distant debugging if needed
if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend='nccl')
args.n_gpu = 1
args.device = device
# Setup logging
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
# Set seed
set_seed(args)
# Prepare XNLI task
args.task_name = 'xnli'
if args.task_name not in processors:
raise ValueError("Task not found: %s" % (args.task_name))
processor = processors[args.task_name](language=args.language, train_language=args.train_language)
args.output_mode = output_modes[args.task_name]
label_list = processor.get_labels()
num_labels = len(label_list)
# Load pretrained model and tokenizer
if args.local_rank not in [-1, 0]:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
num_labels=num_labels,
finetuning_task=args.task_name,
cache_dir=args.cache_dir if args.cache_dir else None)
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None)
model = model_class.from_pretrained(args.model_name_or_path,
from_tf=bool('.ckpt' in args.model_name_or_path),
config=config,
cache_dir=args.cache_dir if args.cache_dir else None)
if args.local_rank == 0:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
model.to(args.device)
logger.info("Training/evaluation parameters %s", args)
# Training
if args.do_train:
train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
# Create output directory if needed
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
os.makedirs(args.output_dir)
logger.info("Saving model checkpoint to %s", args.output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(args.output_dir)
tokenizer.save_pretrained(args.output_dir)
# Good practice: save your training arguments together with the trained model
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
# Load a trained model and vocabulary that you have fine-tuned
model = model_class.from_pretrained(args.output_dir)
tokenizer = tokenizer_class.from_pretrained(args.output_dir)
model.to(args.device)
# Evaluation
results = {}
if args.do_eval and args.local_rank in [-1, 0]:
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
checkpoints = [args.output_dir]
if args.eval_all_checkpoints:
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
logger.info("Evaluate the following checkpoints: %s", checkpoints)
for checkpoint in checkpoints:
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
model = model_class.from_pretrained(checkpoint)
model.to(args.device)
result = evaluate(args, model, tokenizer, prefix=prefix)
result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
results.update(result)
return results
if __name__ == "__main__":
main()
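# Illustrative invocation of the XNLI script above (a sketch; the file name `run_xnli.py`
# and the hyper-parameter values are assumptions, but every flag is defined by the
# argument parser above):
#
#   python run_xnli.py \
#       --model_type bert \
#       --model_name_or_path bert-base-multilingual-cased \
#       --language de \
#       --train_language en \
#       --do_train \
#       --do_eval \
#       --data_dir $XNLI_DIR \
#       --output_dir /tmp/xnli_output \
#       --per_gpu_train_batch_size 32 \
#       --learning_rate 5e-5 \
#       --num_train_epochs 2.0 \
#       --max_seq_length 128 \
#       --save_steps 1000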
# Text Summarization with Pretrained Encoders
This folder contains part of the code necessary to reproduce the results on abstractive summarization from the article [Text Summarization with Pretrained Encoders](https://arxiv.org/pdf/1908.08345.pdf) by [Yang Liu](https://nlp-yang.github.io/) and [Mirella Lapata](https://homepages.inf.ed.ac.uk/mlap/). It can also be used to summarize any document.
The original code can be found in Yang Liu's [GitHub repository](https://github.com/nlpyang/PreSumm).
The model is loaded with the pre-trained weights of the abstractive summarization model trained on the CNN/Daily Mail dataset, first with an extractive and then an abstractive objective.
## Setup
```bash
git clone https://github.com/huggingface/transformers && cd transformers
pip install [--editable] .
pip install nltk py-rouge
cd examples/summarization
```
## Reproduce the authors' results on ROUGE
To reproduce the authors' results on the CNN/Daily Mail dataset, first download both the CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") into the same folder. Then uncompress the archives by running:
```bash
tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
```
Then move all the stories into a single folder; we will refer to its path as `$DATA_PATH`. Finally, run the following in the same folder as `run_summarization.py`:
```bash
python run_summarization.py \
--documents_dir $DATA_PATH \
--summaries_output_dir $SUMMARIES_PATH \ # optional
--to_cpu false \
--batch_size 4 \
--min_length 50 \
--max_length 200 \
--beam_size 5 \
--alpha 0.95 \
--block_trigram true \
--compute_rouge true
```
The script executes on GPU if one is available and `to_cpu` is not set to `true`. Inference on multiple GPUs is not supported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written to a `rouge_scores.txt` file. The run takes about 30 hours on a single Tesla V100 GPU with a batch size of 10 (300,000 texts to summarize).
## Summarize any text
Put the documents that you would like to summarize in a folder (the path to which is referred to as `$DATA_PATH` below) and run the following in the same folder as `run_summarization.py`:
```bash
python run_summarization.py \
--documents_dir $DATA_PATH \
--summaries_output_dir $SUMMARIES_PATH \ # optional
--to_cpu false \
--batch_size 4 \
--min_length 50 \
--max_length 200 \
--beam_size 5 \
--alpha 0.95 \
--block_trigram true \
```
You may want to play around with `min_length`, `max_length` and `alpha` to suit your use case. If you want to compute ROUGE on another dataset you will need to tweak the stories/summaries import in `utils_summarization.py` and tell it where to fetch the reference summaries.
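For example, a run biased toward shorter summaries could lower `min_length` and `max_length` while keeping the other defaults (the values below are purely illustrative):
```bash
python run_summarization.py \
    --documents_dir $DATA_PATH \
    --summaries_output_dir $SUMMARIES_PATH \
    --to_cpu false \
    --batch_size 4 \
    --min_length 10 \
    --max_length 80 \
    --beam_size 5 \
    --alpha 0.95 \
    --block_trigram true
```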
# coding=utf-8
# Copyright 2019 The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BertAbs configuration """
import json
import logging
import sys
from transformers import PretrainedConfig
logger = logging.getLogger(__name__)
BERTABS_FINETUNED_CONFIG_MAP = {
"bertabs-finetuned-cnndm": "https://s3.amazonaws.com/models.huggingface.co/bert/remi/bertabs-finetuned-cnndm-extractive-abstractive-summarization-config.json",
}
class BertAbsConfig(PretrainedConfig):
r""" Class to store the configuration of the BertAbs model.
Arguments:
max_pos: int
The maximum sequence length that this model will be used with.
enc_layers: int
The number of hidden layers in the Transformer encoder.
enc_hidden_size: int
The size of the encoder's layers.
enc_heads: int
The number of attention heads for each attention layer in the encoder.
enc_ff_size: int
The size of the encoder's feed-forward layers.
enc_dropout: float
The dropout probability for all fully connected layers in the
embeddings, layers, pooler and also the attention probabilities in
the encoder.
dec_layers: int
The number of hidden layers in the decoder.
dec_hidden_size: int
The size of the decoder's layers.
dec_heads: int
The number of attention heads for each attention layer in the decoder.
dec_ff_size: int
The size of the decoder's feed-forward layers.
dec_dropout: float
The dropout probability for all fully connected layers in the
embeddings, layers, pooler and also the attention probabilities in
the decoder.
"""
pretrained_config_archive_map = BERTABS_FINETUNED_CONFIG_MAP
def __init__(
self,
vocab_size_or_config_json_file=30522,
max_pos=512,
enc_layers=6,
enc_hidden_size=512,
enc_heads=8,
enc_ff_size=512,
enc_dropout=0.2,
dec_layers=6,
dec_hidden_size=768,
dec_heads=8,
dec_ff_size=2048,
dec_dropout=0.2,
**kwargs,
):
super(BertAbsConfig, self).__init__(**kwargs)
if self._input_is_path_to_json(vocab_size_or_config_json_file):
path_to_json = vocab_size_or_config_json_file
with open(path_to_json, "r", encoding="utf-8") as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.vocab_size = vocab_size_or_config_json_file
self.max_pos = max_pos
self.enc_layers = enc_layers
self.enc_hidden_size = enc_hidden_size
self.enc_heads = enc_heads
self.enc_ff_size = enc_ff_size
self.enc_dropout = enc_dropout
self.dec_layers = dec_layers
self.dec_hidden_size = dec_hidden_size
self.dec_heads = dec_heads
self.dec_ff_size = dec_ff_size
self.dec_dropout = dec_dropout
else:
raise ValueError(
"First argument must be either a vocabulary size (int)"
"or the path to a pretrained model config file (str)"
)
def _input_is_path_to_json(self, first_argument):
""" Checks whether the first argument passed to config
is the path to a JSON file that contains the config.
"""
is_python_2 = sys.version_info[0] == 2
if is_python_2:
return isinstance(first_argument, unicode)
else:
return isinstance(first_argument, str)
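# A minimal usage sketch (not part of the original module): instantiate the default
# configuration and serialize it to JSON. This assumes that the `to_json_string` helper
# inherited from `PretrainedConfig` is available in the installed transformers version.
if __name__ == "__main__":
    config = BertAbsConfig()  # defaults: 6 encoder and 6 decoder layers, max_pos=512
    print(config.to_json_string())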
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Convert BertExtAbs's checkpoints.
The script looks like it is doing something trivial but it is not. The "weights"
provided by the authors are actually the entire pickled model. We need to load
the model within the original codebase to be able to save only its `state_dict`.
"""
import argparse
from collections import namedtuple
import logging
import torch
from models.model_builder import AbsSummarizer # The authors' implementation
from model_bertabs import BertAbsSummarizer
from transformers import BertTokenizer
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
SAMPLE_TEXT = 'Hello world! cécé herlolip'
BertAbsConfig = namedtuple(
"BertAbsConfig",
["temp_dir", "large", "use_bert_emb", "finetune_bert", "encoder", "share_emb", "max_pos", "enc_layers", "enc_hidden_size", "enc_heads", "enc_ff_size", "enc_dropout", "dec_layers", "dec_hidden_size", "dec_heads", "dec_ff_size", "dec_dropout"],
)
def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
""" Copy/paste and tweak the pre-trained weights provided by the creators
of BertAbs for the internal architecture.
"""
# Instantiate the authors' model with the pre-trained weights
config = BertAbsConfig(
temp_dir=".",
finetune_bert=False,
large=False,
share_emb=True,
use_bert_emb=False,
encoder="bert",
max_pos=512,
enc_layers=6,
enc_hidden_size=512,
enc_heads=8,
enc_ff_size=512,
enc_dropout=0.2,
dec_layers=6,
dec_hidden_size=768,
dec_heads=8,
dec_ff_size=2048,
dec_dropout=0.2,
)
checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage)
original = AbsSummarizer(config, torch.device("cpu"), checkpoints)
original.eval()
new_model = BertAbsSummarizer(config, torch.device("cpu"))
new_model.eval()
# -------------------
# Convert the weights
# -------------------
logging.info("convert the model")
new_model.bert.load_state_dict(original.bert.state_dict())
new_model.decoder.load_state_dict(original.decoder.state_dict())
new_model.generator.load_state_dict(original.generator.state_dict())
# ----------------------------------
# Make sure the outputs are identical
# ----------------------------------
logging.info("Make sure that the models' outputs are identical")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# prepare the model inputs
encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
encoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(encoder_input_ids)))
encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0)
decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.")
decoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(decoder_input_ids)))
decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0)
# failsafe to make sure the weights reset does not affect the
# loaded weights.
assert torch.max(torch.abs(original.generator[0].weight - new_model.generator[0].weight)) == 0
# forward pass
src = encoder_input_ids
tgt = decoder_input_ids
segs = token_type_ids = None
clss = None
mask_src = encoder_attention_mask = None
mask_tgt = decoder_attention_mask = None
mask_cls = None
# The original model does not apply the generator layer immediately but rather in
# the beam search (where it combines softmax + linear layer). Since we already
# apply the softmax in our generation process we only apply the linear layer here.
# We make sure that the outputs of the full stack are identical
output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0]
output_original_generator = original.generator(output_original_model)
output_converted_model = new_model(encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask)[0]
output_converted_generator = new_model.generator(output_converted_model)
maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item()
print("Maximum absolute difference beween weights: {:.2f}".format(maximum_absolute_difference))
maximum_absolute_difference = torch.max(torch.abs(output_converted_generator - output_original_generator)).item()
print("Maximum absolute difference beween weights: {:.2f}".format(maximum_absolute_difference))
are_identical = torch.allclose(output_converted_model, output_original_model, atol=1e-3)
if are_identical:
logging.info("all weights are equal up to 1e-3")
else:
raise ValueError("the weights are different. The new model is likely different from the original one.")
# The model has been saved with torch.save(model) and this is bound to the exact
# directory structure. We save the state_dict instead.
logging.info("saving the model's state dictionary")
torch.save(new_model.state_dict(), "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--bertabs_checkpoint_path",
default=None,
type=str,
required=True,
help="Path the official PyTorch dump.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
required=True,
help="Path to the output PyTorch model.",
)
args = parser.parse_args()
convert_bertabs_checkpoints(
args.bertabs_checkpoint_path,
args.pytorch_dump_folder_path,
)
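# Illustrative command line for the conversion above (a sketch; the script file name and
# both paths are assumptions). Note that, as written, the script saves the state dict to a
# hard-coded file name in the current working directory rather than under
# `--pytorch_dump_folder_path`.
#
#   python convert_bertabs_original_pytorch_checkpoint.py \
#       --bertabs_checkpoint_path /path/to/original_bertabs_checkpoint.pt \
#       --pytorch_dump_folder_path /path/to/dump_folder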
# MIT License
# Copyright (c) 2019 Yang Liu and the HuggingFace team
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import copy
import math
import numpy as np
import torch
from torch import nn
from torch.nn.init import xavier_uniform_
from transformers import BertModel, BertConfig, PreTrainedModel
from configuration_bertabs import BertAbsConfig
MAX_SIZE = 5000
BERTABS_FINETUNED_MODEL_MAP = {
"bertabs-finetuned-cnndm": "https://s3.amazonaws.com/models.huggingface.co/bert/remi/bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin",
}
class BertAbsPreTrainedModel(PreTrainedModel):
config_class = BertAbsConfig
pretrained_model_archive_map = BERTABS_FINETUNED_MODEL_MAP
load_tf_weights = False
base_model_prefix = "bert"
class BertAbs(BertAbsPreTrainedModel):
def __init__(self, args, checkpoint=None, bert_extractive_checkpoint=None):
super(BertAbs, self).__init__(args)
self.args = args
self.bert = Bert()
# If pre-trained weights are passed for Bert, load these.
load_bert_pretrained_extractive = True if bert_extractive_checkpoint else False
if load_bert_pretrained_extractive:
self.bert.model.load_state_dict(
dict(
[
(n[11:], p)
for n, p in bert_extractive_checkpoint.items()
if n.startswith("bert.model")
]
),
strict=True,
)
self.vocab_size = self.bert.model.config.vocab_size
if args.max_pos > 512:
my_pos_embeddings = nn.Embedding(
args.max_pos, self.bert.model.config.hidden_size
)
my_pos_embeddings.weight.data[
:512
] = self.bert.model.embeddings.position_embeddings.weight.data
my_pos_embeddings.weight.data[
512:
] = self.bert.model.embeddings.position_embeddings.weight.data[-1][
None, :
].repeat(
args.max_pos - 512, 1
)
self.bert.model.embeddings.position_embeddings = my_pos_embeddings
tgt_embeddings = nn.Embedding(
self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0
)
tgt_embeddings.weight = copy.deepcopy(
self.bert.model.embeddings.word_embeddings.weight
)
self.decoder = TransformerDecoder(
self.args.dec_layers,
self.args.dec_hidden_size,
heads=self.args.dec_heads,
d_ff=self.args.dec_ff_size,
dropout=self.args.dec_dropout,
embeddings=tgt_embeddings,
vocab_size=self.vocab_size,
)
gen_func = nn.LogSoftmax(dim=-1)
self.generator = nn.Sequential(
nn.Linear(args.dec_hidden_size, args.vocab_size), gen_func
)
self.generator[0].weight = self.decoder.embeddings.weight
load_from_checkpoints = False if checkpoint is None else True
if load_from_checkpoints:
self.load_state_dict(checkpoint)
def init_weights(self):
for module in self.decoder.modules():
if isinstance(module, (nn.Linear, nn.Embedding)):
module.weight.data.normal_(mean=0.0, std=0.02)
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
for p in self.generator.parameters():
if p.dim() > 1:
xavier_uniform_(p)
else:
p.data.zero_()
def forward(
self,
encoder_input_ids,
decoder_input_ids,
token_type_ids,
encoder_attention_mask,
decoder_attention_mask,
):
encoder_output = self.bert(
input_ids=encoder_input_ids,
token_type_ids=token_type_ids,
attention_mask=encoder_attention_mask,
)
encoder_hidden_states = encoder_output[0]
dec_state = self.decoder.init_decoder_state(
encoder_input_ids, encoder_hidden_states
)
decoder_outputs, _ = self.decoder(
decoder_input_ids[:, :-1], encoder_hidden_states, dec_state
)
return decoder_outputs
class Bert(nn.Module):
""" This class is not really necessary and should probably disappear.
"""
def __init__(self):
super(Bert, self).__init__()
config = BertConfig.from_pretrained("bert-base-uncased")
self.model = BertModel(config)
def forward(self, input_ids, attention_mask=None, token_type_ids=None, **kwargs):
self.eval()
with torch.no_grad():
encoder_outputs, _ = self.model(
input_ids,
token_type_ids=token_type_ids,
attention_mask=attention_mask,
**kwargs
)
return encoder_outputs
class TransformerDecoder(nn.Module):
"""
The Transformer decoder from "Attention is All You Need".
Args:
num_layers (int): number of encoder layers.
d_model (int): size of the model
heads (int): number of heads
d_ff (int): size of the inner FF layer
dropout (float): dropout parameters
embeddings (:obj:`onmt.modules.Embeddings`):
embeddings to use, should have positional encodings
attn_type (str): if using a separate copy attention
"""
def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings, vocab_size):
super(TransformerDecoder, self).__init__()
# Basic attributes.
self.decoder_type = "transformer"
self.num_layers = num_layers
self.embeddings = embeddings
self.pos_emb = PositionalEncoding(dropout, self.embeddings.embedding_dim)
# Build TransformerDecoder.
self.transformer_layers = nn.ModuleList(
[
TransformerDecoderLayer(d_model, heads, d_ff, dropout)
for _ in range(num_layers)
]
)
self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
# forward(input_ids, attention_mask, encoder_hidden_states, encoder_attention_mask)
# def forward(self, input_ids, state, attention_mask=None, memory_lengths=None,
# step=None, cache=None, encoder_attention_mask=None, encoder_hidden_states=None, memory_masks=None):
def forward(
self,
input_ids,
encoder_hidden_states=None,
state=None,
attention_mask=None,
memory_lengths=None,
step=None,
cache=None,
encoder_attention_mask=None,
):
"""
See :obj:`onmt.modules.RNNDecoderBase.forward()`
memory_bank = encoder_hidden_states
"""
# Name conversion
tgt = input_ids
memory_bank = encoder_hidden_states
memory_mask = encoder_attention_mask
# src_words = state.src
src_words = state.src
src_batch, src_len = src_words.size()
padding_idx = self.embeddings.padding_idx
# Decoder padding mask
tgt_words = tgt
tgt_batch, tgt_len = tgt_words.size()
tgt_pad_mask = (
tgt_words.data.eq(padding_idx).unsqueeze(1).expand(tgt_batch, tgt_len, tgt_len)
)
# Encoder padding mask
if memory_mask is not None:
src_len = memory_mask.size(-1)
src_pad_mask = memory_mask.expand(src_batch, tgt_len, src_len)
else:
src_pad_mask = (
src_words.data.eq(padding_idx)
.unsqueeze(1)
.expand(src_batch, tgt_len, src_len)
)
# Pass through the embeddings
emb = self.embeddings(input_ids)
output = self.pos_emb(emb, step)
assert emb.dim() == 3 # len x batch x embedding_dim
if state.cache is None:
saved_inputs = []
for i in range(self.num_layers):
prev_layer_input = None
if state.cache is None:
if state.previous_input is not None:
prev_layer_input = state.previous_layer_inputs[i]
output, all_input = self.transformer_layers[i](
output,
memory_bank,
src_pad_mask,
tgt_pad_mask,
previous_input=prev_layer_input,
layer_cache=state.cache["layer_{}".format(i)]
if state.cache is not None
else None,
step=step,
)
if state.cache is None:
saved_inputs.append(all_input)
if state.cache is None:
saved_inputs = torch.stack(saved_inputs)
output = self.layer_norm(output)
if state.cache is None:
state = state.update_state(tgt, saved_inputs)
# Decoders in transformers return a tuple. Beam search will fail
# if we don't follow this convention.
return output, state # , state
def init_decoder_state(self, src, memory_bank, with_cache=False):
""" Init decoder state """
state = TransformerDecoderState(src)
if with_cache:
state._init_cache(memory_bank, self.num_layers)
return state
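# Usage sketch for the decoder above (illustrative; tensor names and sizes are assumptions):
#
#     emb = nn.Embedding(30522, 768, padding_idx=0)
#     dec = TransformerDecoder(num_layers=6, d_model=768, heads=8, d_ff=2048,
#                              dropout=0.2, embeddings=emb, vocab_size=30522)
#     # src_ids: [batch, src_len] token ids, enc_states: [batch, src_len, 768]
#     state = dec.init_decoder_state(src_ids, enc_states)
#     out, state = dec(tgt_ids, enc_states, state)   # out: [batch, tgt_len, 768]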
class PositionalEncoding(nn.Module):
def __init__(self, dropout, dim, max_len=5000):
pe = torch.zeros(max_len, dim)
position = torch.arange(0, max_len).unsqueeze(1)
div_term = torch.exp(
(torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim))
)
pe[:, 0::2] = torch.sin(position.float() * div_term)
pe[:, 1::2] = torch.cos(position.float() * div_term)
pe = pe.unsqueeze(0)
super(PositionalEncoding, self).__init__()
self.register_buffer("pe", pe)
self.dropout = nn.Dropout(p=dropout)
self.dim = dim
def forward(self, emb, step=None):
emb = emb * math.sqrt(self.dim)
if step:
emb = emb + self.pe[:, step][:, None, :]
else:
emb = emb + self.pe[:, : emb.size(1)]
emb = self.dropout(emb)
return emb
def get_emb(self, emb):
return self.pe[:, : emb.size(1)]
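# Shape sketch (illustrative): with dim=768, `pe` is a [1, 5000, 768] buffer. Calling
# `forward(emb)` on an embedding tensor of shape [batch, seq_len, 768] scales it by
# sqrt(768), adds the first `seq_len` positional encodings and applies dropout, so the
# output keeps the [batch, seq_len, 768] shape.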
class TransformerDecoderLayer(nn.Module):
"""
Args:
d_model (int): the dimension of keys/values/queries in
MultiHeadedAttention, also the input size of
the first-layer of the PositionwiseFeedForward.
heads (int): the number of heads for MultiHeadedAttention.
d_ff (int): the second-layer of the PositionwiseFeedForward.
dropout (float): dropout probability(0-1.0).
self_attn_type (string): type of self-attention scaled-dot, average
"""
def __init__(self, d_model, heads, d_ff, dropout):
super(TransformerDecoderLayer, self).__init__()
self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
self.context_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
self.drop = nn.Dropout(dropout)
mask = self._get_attn_subsequent_mask(MAX_SIZE)
# Register self.mask as a buffer in TransformerDecoderLayer, so
# it gets TransformerDecoderLayer's cuda behavior automatically.
self.register_buffer("mask", mask)
def forward(
self,
inputs,
memory_bank,
src_pad_mask,
tgt_pad_mask,
previous_input=None,
layer_cache=None,
step=None,
):
"""
Args:
inputs (`FloatTensor`): `[batch_size x 1 x model_dim]`
memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]`
src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]`
tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]`
Returns:
(`FloatTensor`, `FloatTensor`, `FloatTensor`):
* output `[batch_size x 1 x model_dim]`
* attn `[batch_size x 1 x src_len]`
* all_input `[batch_size x current_step x model_dim]`
"""
dec_mask = torch.gt(
tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)], 0
)
input_norm = self.layer_norm_1(inputs)
all_input = input_norm
if previous_input is not None:
all_input = torch.cat((previous_input, input_norm), dim=1)
dec_mask = None
query = self.self_attn(
all_input,
all_input,
input_norm,
mask=dec_mask,
layer_cache=layer_cache,
type="self",
)
query = self.drop(query) + inputs
query_norm = self.layer_norm_2(query)
mid = self.context_attn(
memory_bank,
memory_bank,
query_norm,
mask=src_pad_mask,
layer_cache=layer_cache,
type="context",
)
output = self.feed_forward(self.drop(mid) + query)
return output, all_input
# return output
def _get_attn_subsequent_mask(self, size):
"""
Get an attention mask to avoid using the subsequent info.
Args:
size: int
Returns:
(`LongTensor`):
* subsequent_mask `[1 x size x size]`
"""
attn_shape = (1, size, size)
subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype("uint8")
subsequent_mask = torch.from_numpy(subsequent_mask)
return subsequent_mask
class MultiHeadedAttention(nn.Module):
"""
Multi-Head Attention module from
"Attention is All You Need"
:cite:`DBLP:journals/corr/VaswaniSPUJGKP17`.
Similar to standard `dot` attention but uses
multiple attention distributions simultaneously
to select relevant items.
.. mermaid::
graph BT
A[key]
B[value]
C[query]
O[output]
subgraph Attn
D[Attn 1]
E[Attn 2]
F[Attn N]
end
A --> D
C --> D
A --> E
C --> E
A --> F
C --> F
D --> O
E --> O
F --> O
B --> O
Also includes several additional tricks.
Args:
head_count (int): number of parallel heads
model_dim (int): the dimension of keys/values/queries,
must be divisible by head_count
dropout (float): dropout parameter
"""
def __init__(self, head_count, model_dim, dropout=0.1, use_final_linear=True):
assert model_dim % head_count == 0
self.dim_per_head = model_dim // head_count
self.model_dim = model_dim
super(MultiHeadedAttention, self).__init__()
self.head_count = head_count
self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head)
self.linear_values = nn.Linear(model_dim, head_count * self.dim_per_head)
self.linear_query = nn.Linear(model_dim, head_count * self.dim_per_head)
self.softmax = nn.Softmax(dim=-1)
self.dropout = nn.Dropout(dropout)
self.use_final_linear = use_final_linear
if self.use_final_linear:
self.final_linear = nn.Linear(model_dim, model_dim)
def forward(
self,
key,
value,
query,
mask=None,
layer_cache=None,
type=None,
predefined_graph_1=None,
):
"""
Compute the context vector and the attention vectors.
Args:
key (`FloatTensor`): set of `key_len`
key vectors `[batch, key_len, dim]`
value (`FloatTensor`): set of `key_len`
value vectors `[batch, key_len, dim]`
query (`FloatTensor`): set of `query_len`
query vectors `[batch, query_len, dim]`
mask: binary mask indicating which keys have
non-zero attention `[batch, query_len, key_len]`
Returns:
(`FloatTensor`, `FloatTensor`) :
* output context vectors `[batch, query_len, dim]`
* one of the attention vectors `[batch, query_len, key_len]`
"""
batch_size = key.size(0)
dim_per_head = self.dim_per_head
head_count = self.head_count
key_len = key.size(1)
query_len = query.size(1)
def shape(x):
""" projection """
return x.view(batch_size, -1, head_count, dim_per_head).transpose(1, 2)
def unshape(x):
""" compute context """
return (
x.transpose(1, 2)
.contiguous()
.view(batch_size, -1, head_count * dim_per_head)
)
# 1) Project key, value, and query.
if layer_cache is not None:
if type == "self":
query, key, value = (
self.linear_query(query),
self.linear_keys(query),
self.linear_values(query),
)
key = shape(key)
value = shape(value)
if layer_cache is not None:
device = key.device
if layer_cache["self_keys"] is not None:
key = torch.cat((layer_cache["self_keys"].to(device), key), dim=2)
if layer_cache["self_values"] is not None:
value = torch.cat(
(layer_cache["self_values"].to(device), value), dim=2
)
layer_cache["self_keys"] = key
layer_cache["self_values"] = value
elif type == "context":
query = self.linear_query(query)
if layer_cache is not None:
if layer_cache["memory_keys"] is None:
key, value = self.linear_keys(key), self.linear_values(value)
key = shape(key)
value = shape(value)
else:
key, value = (
layer_cache["memory_keys"],
layer_cache["memory_values"],
)
layer_cache["memory_keys"] = key
layer_cache["memory_values"] = value
else:
key, value = self.linear_keys(key), self.linear_values(value)
key = shape(key)
value = shape(value)
else:
key = self.linear_keys(key)
value = self.linear_values(value)
query = self.linear_query(query)
key = shape(key)
value = shape(value)
query = shape(query)
key_len = key.size(2)
query_len = query.size(2)
# 2) Calculate and scale scores.
query = query / math.sqrt(dim_per_head)
scores = torch.matmul(query, key.transpose(2, 3))
if mask is not None:
mask = mask.unsqueeze(1).expand_as(scores)
scores = scores.masked_fill(mask, -1e18)
# 3) Apply attention dropout and compute context vectors.
attn = self.softmax(scores)
if predefined_graph_1 is not None:
attn_masked = attn[:, -1] * predefined_graph_1
attn_masked = attn_masked / (torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9)
attn = torch.cat([attn[:, :-1], attn_masked.unsqueeze(1)], 1)
drop_attn = self.dropout(attn)
if self.use_final_linear:
context = unshape(torch.matmul(drop_attn, value))
output = self.final_linear(context)
return output
else:
context = torch.matmul(drop_attn, value)
return context
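# Shape sketch for the module above (illustrative, not part of the original file):
#
#     attn = MultiHeadedAttention(head_count=8, model_dim=512, dropout=0.1)
#     x = torch.rand(2, 10, 512)   # [batch, seq_len, model_dim]
#     out = attn(x, x, x)          # self-attention, no mask, no cache
#     assert out.shape == (2, 10, 512)  # final linear projects back to model_dim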
class DecoderState(object):
"""Interface for grouping together the current state of a recurrent
decoder. In the simplest case just represents the hidden state of
the model. But can also be used for implementing various forms of
input_feeding and non-recurrent models.
Modules need to implement this to utilize beam search decoding.
"""
def detach(self):
""" Need to document this """
self.hidden = tuple([_.detach() for _ in self.hidden])
self.input_feed = self.input_feed.detach()
def beam_update(self, idx, positions, beam_size):
""" Need to document this """
for e in self._all:
sizes = e.size()
br = sizes[1]
if len(sizes) == 3:
sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2])[
:, :, idx
]
else:
sent_states = e.view(
sizes[0], beam_size, br // beam_size, sizes[2], sizes[3]
)[:, :, idx]
sent_states.data.copy_(sent_states.data.index_select(1, positions))
def map_batch_fn(self, fn):
raise NotImplementedError()
class TransformerDecoderState(DecoderState):
""" Transformer Decoder state base class """
def __init__(self, src):
"""
Args:
src (FloatTensor): a sequence of source words tensors
with optional feature tensors, of size (len x batch).
"""
self.src = src
self.previous_input = None
self.previous_layer_inputs = None
self.cache = None
@property
def _all(self):
"""
Contains attributes that need to be updated in self.beam_update().
"""
if self.previous_input is not None and self.previous_layer_inputs is not None:
return (self.previous_input, self.previous_layer_inputs, self.src)
else:
return (self.src,)
def detach(self):
if self.previous_input is not None:
self.previous_input = self.previous_input.detach()
if self.previous_layer_inputs is not None:
self.previous_layer_inputs = self.previous_layer_inputs.detach()
self.src = self.src.detach()
def update_state(self, new_input, previous_layer_inputs):
state = TransformerDecoderState(self.src)
state.previous_input = new_input
state.previous_layer_inputs = previous_layer_inputs
return state
def _init_cache(self, memory_bank, num_layers):
self.cache = {}
for l in range(num_layers):
layer_cache = {"memory_keys": None, "memory_values": None}
layer_cache["self_keys"] = None
layer_cache["self_values"] = None
self.cache["layer_{}".format(l)] = layer_cache
def repeat_beam_size_times(self, beam_size):
""" Repeat beam_size times along batch dimension. """
self.src = self.src.data.repeat(1, beam_size, 1)
def map_batch_fn(self, fn):
def _recursive_map(struct, batch_dim=0):
for k, v in struct.items():
if v is not None:
if isinstance(v, dict):
_recursive_map(v)
else:
struct[k] = fn(v, batch_dim)
self.src = fn(self.src, 0)
if self.cache is not None:
_recursive_map(self.cache)
def gelu(x):
return (
0.5
* x
* (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
)
class PositionwiseFeedForward(nn.Module):
""" A two-layer Feed-Forward-Network with residual layer norm.
Args:
d_model (int): the size of input for the first-layer of the FFN.
d_ff (int): the hidden layer size of the second-layer
of the FFN.
dropout (float): dropout probability in :math:`[0, 1)`.
"""
def __init__(self, d_model, d_ff, dropout=0.1):
super(PositionwiseFeedForward, self).__init__()
self.w_1 = nn.Linear(d_model, d_ff)
self.w_2 = nn.Linear(d_ff, d_model)
self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
self.actv = gelu
self.dropout_1 = nn.Dropout(dropout)
self.dropout_2 = nn.Dropout(dropout)
def forward(self, x):
inter = self.dropout_1(self.actv(self.w_1(self.layer_norm(x))))
output = self.dropout_2(self.w_2(inter))
return output + x
#
# TRANSLATOR
# The following code is used to generate summaries using the
# pre-trained weights and beam search.
#
def build_predictor(args, tokenizer, symbols, model, logger=None):
# we should be able to refactor the global scorer a lot
scorer = GNMTGlobalScorer(args.alpha, length_penalty="wu")
translator = Translator(
args, model, tokenizer, symbols, global_scorer=scorer, logger=logger
)
return translator
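# Usage sketch (illustrative): `symbols` maps "BOS"/"EOS" to the token ids used to start
# and stop generation, and `args` must define at least `alpha`, `beam_size`, `min_length`,
# `max_length` and `block_trigram`.
#
#     symbols = {"BOS": bos_token_id, "EOS": eos_token_id}
#     predictor = build_predictor(args, tokenizer, symbols, bertabs_model)
#     translations = predictor.translate(batch, step=0)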
class GNMTGlobalScorer(object):
"""
NMT re-ranking score from
"Google's Neural Machine Translation System" :cite:`wu2016google`
Args:
alpha (float): length parameter
beta (float): coverage parameter
"""
def __init__(self, alpha, length_penalty):
self.alpha = alpha
penalty_builder = PenaltyBuilder(length_penalty)
self.length_penalty = penalty_builder.length_penalty()
def score(self, beam, logprobs):
"""
Rescores a prediction based on penalty functions
"""
normalized_probs = self.length_penalty(beam, logprobs, self.alpha)
return normalized_probs
class PenaltyBuilder(object):
"""
Returns the Length and Coverage Penalty function for Beam Search.
Args:
length_pen (str): option name of length pen
cov_pen (str): option name of cov pen
"""
def __init__(self, length_pen):
self.length_pen = length_pen
def length_penalty(self):
if self.length_pen == "wu":
return self.length_wu
elif self.length_pen == "avg":
return self.length_average
else:
return self.length_none
"""
Below are all the different penalty terms implemented so far
"""
def length_wu(self, beam, logprobs, alpha=0.0):
"""
NMT length re-ranking score from
"Google's Neural Machine Translation System" :cite:`wu2016google`.
"""
modifier = ((5 + len(beam.next_ys)) ** alpha) / ((5 + 1) ** alpha)
return logprobs / modifier
def length_average(self, beam, logprobs, alpha=0.0):
"""
Returns the average probability of tokens in a sequence.
"""
return logprobs / len(beam.next_ys)
def length_none(self, beam, logprobs, alpha=0.0, beta=0.0):
"""
Returns unmodified scores.
"""
return logprobs
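# Worked example for `length_wu` above (illustrative): with alpha = 0.95 and a 20-token
# hypothesis the modifier is ((5 + 20) ** 0.95) / ((5 + 1) ** 0.95) ≈ 3.88, so the
# hypothesis' cumulative log-probability is divided by roughly 3.88 before ranking;
# with alpha = 0 the modifier is 1 and no length normalization is applied.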
class Translator(object):
"""
Uses a model to translate a batch of sentences.
Args:
model (:obj:`onmt.modules.NMTModel`):
NMT model to use for translation
fields (dict of Fields): data fields
beam_size (int): size of beam to use
n_best (int): number of translations produced
max_length (int): maximum length output to produce
global_scores (:obj:`GlobalScorer`):
object to rescore final translations
copy_attn (bool): use copy attention during translation
beam_trace (bool): trace beam search for debugging
logger(logging.Logger): logger.
"""
def __init__(self, args, model, vocab, symbols, global_scorer=None, logger=None):
self.logger = logger
self.args = args
self.model = model
self.generator = self.model.generator
self.vocab = vocab
self.symbols = symbols
self.start_token = symbols["BOS"]
self.end_token = symbols["EOS"]
self.global_scorer = global_scorer
self.beam_size = args.beam_size
self.min_length = args.min_length
self.max_length = args.max_length
def translate(self, batch, step, attn_debug=False):
""" Generates summaries from one batch of data.
"""
self.model.eval()
with torch.no_grad():
batch_data = self.translate_batch(batch)
translations = self.from_batch(batch_data)
return translations
def translate_batch(self, batch, fast=False):
"""
Translate a batch of sentences.
Mostly a wrapper around :obj:`Beam`.
Args:
batch (:obj:`Batch`): a batch from a dataset object
data (:obj:`Dataset`): the dataset object
fast (bool): enables fast beam search (may not support all features)
Todo:
Shouldn't need the original dataset.
"""
with torch.no_grad():
return self._fast_translate_batch(
batch, self.max_length, min_length=self.min_length
)
# Where the beam search lives
# I have no idea why it is being called from the method above
def _fast_translate_batch(self, batch, max_length, min_length=0):
""" Beam Search using the encoder inputs contained in `batch`.
"""
# The `Batch` namedtuple used by the evaluation script carries an explicit
# `batch_size` attribute instead of relying on the tensors' first dimension.
beam_size = self.beam_size
batch_size = batch.batch_size
src = batch.src
segs = batch.segs
mask_src = batch.mask_src
src_features = self.model.bert(src, segs, mask_src)
dec_states = self.model.decoder.init_decoder_state(
src, src_features, with_cache=True
)
device = src_features.device
# Tile states and memory beam_size times.
dec_states.map_batch_fn(lambda state, dim: tile(state, beam_size, dim=dim))
src_features = tile(src_features, beam_size, dim=0)
batch_offset = torch.arange(batch_size, dtype=torch.long, device=device)
beam_offset = torch.arange(
0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device
)
alive_seq = torch.full(
[batch_size * beam_size, 1], self.start_token, dtype=torch.long, device=device
)
# Give full probability to the first beam on the first step.
topk_log_probs = torch.tensor(
[0.0] + [float("-inf")] * (beam_size - 1), device=device
).repeat(batch_size)
# Structure that holds finished hypotheses.
hypotheses = [[] for _ in range(batch_size)] # noqa: F812
results = {}
results["predictions"] = [[] for _ in range(batch_size)] # noqa: F812
results["scores"] = [[] for _ in range(batch_size)] # noqa: F812
results["gold_score"] = [0] * batch_size
results["batch"] = batch
for step in range(max_length):
decoder_input = alive_seq[:, -1].view(1, -1)
# Decoder forward.
decoder_input = decoder_input.transpose(0, 1)
dec_out, dec_states = self.model.decoder(
decoder_input, src_features, dec_states, step=step
)
# Generator forward.
log_probs = self.generator.forward(dec_out.transpose(0, 1).squeeze(0))
vocab_size = log_probs.size(-1)
if step < min_length:
log_probs[:, self.end_token] = -1e20
# Add the running beam log-probabilities to the token log-probabilities.
log_probs += topk_log_probs.view(-1).unsqueeze(1)
alpha = self.global_scorer.alpha
length_penalty = ((5.0 + (step + 1)) / 6.0) ** alpha
# Flatten probs into a list of possibilities.
curr_scores = log_probs / length_penalty
if self.args.block_trigram:
cur_len = alive_seq.size(1)
if cur_len > 3:
for i in range(alive_seq.size(0)):
fail = False
words = [int(w) for w in alive_seq[i]]
words = [self.vocab.ids_to_tokens[w] for w in words]
words = " ".join(words).replace(" ##", "").split()
if len(words) <= 3:
continue
trigrams = [
(words[i - 1], words[i], words[i + 1])
for i in range(1, len(words) - 1)
]
trigram = tuple(trigrams[-1])
if trigram in trigrams[:-1]:
fail = True
if fail:
curr_scores[i] = -10e20
curr_scores = curr_scores.reshape(-1, beam_size * vocab_size)
topk_scores, topk_ids = curr_scores.topk(beam_size, dim=-1)
# Recover log probs.
topk_log_probs = topk_scores * length_penalty
# Resolve beam origin and true word ids.
topk_beam_index = topk_ids.div(vocab_size)
topk_ids = topk_ids.fmod(vocab_size)
# Map beam_index to batch_index in the flat representation.
batch_index = topk_beam_index + beam_offset[
: topk_beam_index.size(0)
].unsqueeze(1)
select_indices = batch_index.view(-1)
# Append last prediction.
alive_seq = torch.cat(
[alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1
)
is_finished = topk_ids.eq(self.end_token)
if step + 1 == max_length:
is_finished.fill_(1)
# The end condition is that the top beam is finished.
end_condition = is_finished[:, 0].eq(1)
# Save finished hypotheses.
if is_finished.any():
predictions = alive_seq.view(-1, beam_size, alive_seq.size(-1))
for i in range(is_finished.size(0)):
b = batch_offset[i]
if end_condition[i]:
is_finished[i].fill_(1)
finished_hyp = is_finished[i].nonzero().view(-1)
# Store finished hypotheses for this batch.
for j in finished_hyp:
hypotheses[b].append((topk_scores[i, j], predictions[i, j, 1:]))
# If the batch reached the end, save the n_best hypotheses.
if end_condition[i]:
best_hyp = sorted(hypotheses[b], key=lambda x: x[0], reverse=True)
score, pred = best_hyp[0]
results["scores"][b].append(score)
results["predictions"][b].append(pred)
non_finished = end_condition.eq(0).nonzero().view(-1)
# If all sentences are translated, no need to go further.
if len(non_finished) == 0:
break
# Remove finished batches for the next step.
topk_log_probs = topk_log_probs.index_select(0, non_finished)
batch_index = batch_index.index_select(0, non_finished)
batch_offset = batch_offset.index_select(0, non_finished)
alive_seq = predictions.index_select(0, non_finished).view(
-1, alive_seq.size(-1)
)
# Reorder states.
select_indices = batch_index.view(-1)
src_features = src_features.index_select(0, select_indices)
dec_states.map_batch_fn(
lambda state, dim: state.index_select(dim, select_indices)
)
return results
def from_batch(self, translation_batch):
batch = translation_batch["batch"]
assert len(translation_batch["gold_score"]) == len(translation_batch["predictions"])
batch_size = batch.batch_size
preds, _, _, tgt_str, src = (
translation_batch["predictions"],
translation_batch["scores"],
translation_batch["gold_score"],
batch.tgt_str,
batch.src,
)
translations = []
for b in range(batch_size):
pred_sents = self.vocab.convert_ids_to_tokens([int(n) for n in preds[b][0]])
pred_sents = " ".join(pred_sents).replace(" ##", "")
gold_sent = " ".join(tgt_str[b].split())
raw_src = [self.vocab.ids_to_tokens[int(t)] for t in src[b]][:500]
raw_src = " ".join(raw_src)
translation = (pred_sents, gold_sent, raw_src)
translations.append(translation)
return translations
def tile(x, count, dim=0):
"""
Tiles x on dimension dim count times.
"""
perm = list(range(len(x.size())))
if dim != 0:
perm[0], perm[dim] = perm[dim], perm[0]
x = x.permute(perm).contiguous()
out_size = list(x.size())
out_size[0] *= count
batch = x.size(0)
x = (
x.view(batch, -1)
.transpose(0, 1)
.repeat(count, 1)
.transpose(0, 1)
.contiguous()
.view(*out_size)
)
if dim != 0:
x = x.permute(perm).contiguous()
return x
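# A small usage sketch (hypothetical values, not part of the original module):
# `tile` repeats each row `count` times along `dim`, which is how the encoder
# outputs are expanded to `beam_size` copies before beam search starts.
def _demo_tile():
    x = torch.tensor([[1, 2], [3, 4]])
    tiled = tile(x, 3, dim=0)
    # tiled == tensor([[1, 2], [1, 2], [1, 2], [3, 4], [3, 4], [3, 4]]):
    # the copies of each batch entry stay grouped together.
    return tiled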
#
# Optimizer for training. We keep this here in case we want to add
# a finetuning script.
#
class BertSumOptimizer(object):
""" Specific optimizer for BertSum.
As described in [1], the authors fine-tune BertSum for abstractive
summarization using two Adam optimizers with different warm-up steps and
learning rates. They also use a custom learning rate scheduler.
[1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
arXiv preprint arXiv:1908.08345 (2019).
"""
def __init__(self, model, lr, warmup_steps, beta_1=0.99, beta_2=0.999, eps=1e-8):
self.encoder = model.encoder
self.decoder = model.decoder
self.lr = lr
self.warmup_steps = warmup_steps
self.optimizers = {
"encoder": torch.optim.Adam(
model.encoder.parameters(),
lr=lr["encoder"],
betas=(beta_1, beta_2),
eps=eps,
),
"decoder": torch.optim.Adam(
model.decoder.parameters(),
lr=lr["decoder"],
betas=(beta_1, beta_2),
eps=eps,
),
}
self._step = 0
self.current_learning_rates = {}
def _update_rate(self, stack):
return self.lr[stack] * min(
self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-1.5)
)
def zero_grad(self):
for optimizer in self.optimizers.values():
    optimizer.zero_grad()
def step(self):
self._step += 1
for stack, optimizer in self.optimizers.items():
new_rate = self._update_rate(stack)
for param_group in optimizer.param_groups:
param_group["lr"] = new_rate
optimizer.step()
self.current_learning_rates[stack] = new_rate
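# A minimal sketch (hypothetical hyper-parameters) of the Noam-style schedule in
# `_update_rate`: the learning rate grows linearly during warm-up and then decays
# with the inverse square root of the step, separately for encoder and decoder.
def _demo_bertsum_lr(lr=0.002, warmup_steps=20000, step=1000):
    return lr * min(step ** (-0.5), step * warmup_steps ** (-1.5))
# During warm-up the second term is the smaller one, e.g.
# _demo_bertsum_lr(step=1000) == 0.002 * 1000 * 20000 ** (-1.5) ~= 7.1e-07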
# progress bars in model download and training scripts
tqdm
# Accessing files from S3 directly.
boto3
# Used for downloading models over HTTP
requests
# For ROUGE
nltk
py-rouge
#! /usr/bin/python3
import argparse
from collections import namedtuple
import logging
import os
import sys
import torch
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm
from transformers import BertTokenizer
from modeling_bertabs import BertAbs, build_predictor
from utils_summarization import (
SummarizationDataset,
encode_for_summarization,
build_mask,
fit_to_block_size,
compute_token_type_ids,
)
logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
Batch = namedtuple(
"Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"]
)
def evaluate(args):
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
model = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
model.to(args.device)
model.eval()
symbols = {
"BOS": tokenizer.vocab["[unused0]"],
"EOS": tokenizer.vocab["[unused1]"],
"PAD": tokenizer.vocab["[PAD]"],
}
if args.compute_rouge:
reference_summaries = []
generated_summaries = []
import rouge
import nltk
nltk.download('punkt')
rouge_evaluator = rouge.Rouge(
metrics=['rouge-n', 'rouge-l'],
max_n=2,
limit_length=True,
length_limit=args.beam_size,
length_limit_type='words',
apply_avg=True,
apply_best=False,
alpha=0.5, # Default F1_score
weight_factor=1.2,
stemming=True,
)
# these (unused) arguments are defined to keep compatibility
# with the legacy code and will be removed in a future iteration.
args.result_path = ""
args.temp_dir = ""
data_iterator = build_data_iterator(args, tokenizer)
predictor = build_predictor(args, tokenizer, symbols, model)
logger.info("***** Running evaluation *****")
logger.info(" Number examples = %d", len(data_iterator.dataset))
logger.info(" Batch size = %d", args.batch_size)
logger.info("")
logger.info("***** Beam Search parameters *****")
logger.info(" Beam size = %d", args.beam_size)
logger.info(" Minimum length = %d", args.min_length)
logger.info(" Maximum length = %d", args.max_length)
logger.info(" Alpha (length penalty) = %.2f", args.alpha)
logger.info(" Trigrams %s be blocked", ("will" if args.block_trigram else "will NOT"))
for batch in tqdm(data_iterator):
batch_data = predictor.translate_batch(batch)
translations = predictor.from_batch(batch_data)
summaries = [format_summary(t) for t in translations]
save_summaries(summaries, args.summaries_output_dir, batch.document_names)
if args.compute_rouge:
reference_summaries += batch.tgt_str
generated_summaries += summaries
if args.compute_rouge:
scores = rouge_evaluator.get_scores(generated_summaries, reference_summaries)
str_scores = format_rouge_scores(scores)
save_rouge_scores(str_scores)
print(str_scores)
def save_summaries(summaries, path, original_document_name):
""" Write the summaries in fies that are prefixed by the original
files' name with the `_summary` appended.
Attributes:
original_document_names: List[string]
Name of the document that was summarized.
path: string
Path were the summaries will be written
summaries: List[string]
The summaries that we produced.
"""
for summary, document_name in zip(summaries, original_document_name):
# Prepare the summary file's name
if "." in document_name:
bare_document_name = ".".join(document_name.split(".")[:-1])
extension = document_name.split(".")[-1]
name = bare_document_name + "_summary." + extension
else:
name = document_name + "_summary"
file_path = os.path.join(path, name)
with open(file_path, "w") as output:
output.write(summary)
def format_summary(translation):
""" Transforms the output of the `from_batch` function
into nicely formatted summaries.
"""
raw_summary, _, _ = translation
summary = (
raw_summary.replace("[unused0]", "")
.replace("[unused3]", "")
.replace("[PAD]", "")
.replace("[unused1]", "")
.replace(r" +", " ")
.replace(" [unused2] ", ". ")
.replace("[unused2]", "")
.strip()
)
return summary
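# A small usage sketch (hypothetical decoder output) for `format_summary`: the
# BertAbs vocabulary placeholders [unused0]/[unused1] delimit the sequence and
# [unused2] marks sentence boundaries, so they are stripped or turned into periods.
def _demo_format_summary():
    translation = ("[unused0] the cat sat [unused2] it purred [unused1]", "", "")
    summary = format_summary(translation)
    # summary == "the cat sat. it purred"
    return summary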
def format_rouge_scores(scores):
return """\n
****** ROUGE SCORES ******
** ROUGE 1
F1 >> {:.3f}
Precision >> {:.3f}
Recall >> {:.3f}
** ROUGE 2
F1 >> {:.3f}
Precision >> {:.3f}
Recall >> {:.3f}
** ROUGE L
F1 >> {:.3f}
Precision >> {:.3f}
Recall >> {:.3f}""".format(
scores['rouge-1']['f'],
scores['rouge-1']['p'],
scores['rouge-1']['r'],
scores['rouge-2']['f'],
scores['rouge-2']['p'],
scores['rouge-2']['r'],
scores['rouge-l']['f'],
scores['rouge-l']['p'],
scores['rouge-l']['r'],
)
def save_rouge_scores(str_scores):
with open("rouge_scores.txt", "w") as output:
output.write(str_scores)
#
# LOAD the dataset
#
def build_data_iterator(args, tokenizer):
dataset = load_and_cache_examples(args, tokenizer)
sampler = SequentialSampler(dataset)
collate_fn = lambda data: collate(data, tokenizer, block_size=512, device=args.device)
iterator = DataLoader(
dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,
)
return iterator
def load_and_cache_examples(args, tokenizer):
dataset = SummarizationDataset(args.documents_dir)
return dataset
def collate(data, tokenizer, block_size, device):
""" Collate formats the data passed to the data loader.
In particular we tokenize the documents batch by batch to avoid keeping them
all in memory. We output the data as a namedtuple to fit the original BertAbs
API.
"""
data = [x for x in data if not len(x[1]) == 0] # remove empty_files
names = [name for name, _, _ in data]
summaries = [" ".join(summary_list) for _, _, summary_list in data]
encoded_text = [
encode_for_summarization(story, summary, tokenizer) for _, story, summary in data
]
encoded_stories = torch.tensor(
[
fit_to_block_size(story, block_size, tokenizer.pad_token_id)
for story, _ in encoded_text
]
)
encoder_token_type_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id)
encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id)
batch = Batch(
document_names=names,
batch_size=len(encoded_stories),
src=encoded_stories.to(device),
segs=encoder_token_type_ids.to(device),
mask_src=encoder_mask.to(device),
tgt_str=summaries,
)
return batch
def decode_summary(summary_tokens, tokenizer):
""" Decode the summary and return it in a format
suitable for evaluation.
"""
summary_tokens = summary_tokens.to("cpu").numpy()
summary = tokenizer.decode(summary_tokens)
sentences = summary.split(".")
sentences = [s + "." for s in sentences]
return sentences
def main():
""" The main function defines the interface with the users.
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"--documents_dir",
default=None,
type=str,
required=True,
help="The folder where the documents to summarize are located.",
)
parser.add_argument(
"--summaries_output_dir",
default=None,
type=str,
required=False,
help="The folder in wich the summaries should be written. Defaults to the folder where the documents are",
)
parser.add_argument(
"--compute_rouge",
default=False,
type=bool,
required=False,
help="Compute the ROUGE metrics during evaluation. Only available for the CNN/DailyMail dataset.",
)
# EVALUATION options
parser.add_argument(
"--no_cuda",
default=False,
type=bool,
help="Whether to force the execution on CPU.",
)
parser.add_argument(
"--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
)
# BEAM SEARCH arguments
parser.add_argument(
"--min_length",
default=50,
type=int,
help="Minimum number of tokens for the summaries.",
)
parser.add_argument(
"--max_length",
default=200,
type=int,
help="Maixmum number of tokens for the summaries.",
)
parser.add_argument(
"--beam_size",
default=5,
type=int,
help="The number of beams to start with for each example.",
)
parser.add_argument(
"--alpha",
default=0.95,
type=float,
help="The value of alpha for the length penalty in the beam search.",
)
parser.add_argument(
"--block_trigram",
default=True,
type=bool,
help="Whether to block the existence of repeating trigrams in the text generated by beam search.",
)
args = parser.parse_args()
# Select device (distributed execution is not supported)
args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
# Check the existence of directories
if not args.summaries_output_dir:
args.summaries_output_dir = args.documents_dir
if not documents_dir_is_valid(args.documents_dir):
raise FileNotFoundError(
"We could not find the directory you specified for the documents to summarize, or it was empty. Please specify a valid path."
)
os.makedirs(args.summaries_output_dir, exist_ok=True)
evaluate(args)
def documents_dir_is_valid(path):
if not os.path.exists(path):
return False
file_list = os.listdir(path)
if len(file_list) == 0:
return False
return True
if __name__ == "__main__":
main()
from collections import deque
import os
import torch
from torch.utils.data import Dataset
# ------------
# Data loading
# ------------
class SummarizationDataset(Dataset):
""" Abstracts the dataset used to train seq2seq models.
The class will process the documents that are located in the specified
folder. The preprocessing will work on any document that is reasonably
formatted. On the CNN/DailyMail dataset it will extract both the story
and the summary.
CNN/Daily News:
The CNN/Daily News raw datasets are downloaded from [1]. The stories are
stored in different files; the summary appears at the end of the story as
sentences that are prefixed by the special `@highlight` line. To process
the data, untar both datasets in the same folder, and pass the path to this
folder as the "data_dir argument. The formatting code was inspired by [2].
[1] https://cs.nyu.edu/~kcho/
[2] https://github.com/abisee/cnn-dailymail/
"""
def __init__(self, path="", prefix="train"):
""" We initialize the class by listing all the documents to summarize.
Files are not read in memory due to the size of some datasets (like CNN/DailyMail).
"""
assert os.path.isdir(path)
self.documents = []
story_filenames_list = os.listdir(path)
for story_filename in story_filenames_list:
if "summary" in story_filename:
continue
path_to_story = os.path.join(path, story_filename)
if not os.path.isfile(path_to_story):
continue
self.documents.append(path_to_story)
def __len__(self):
""" Returns the number of documents. """
return len(self.documents)
def __getitem__(self, idx):
document_path = self.documents[idx]
document_name = document_path.split("/")[-1]
with open(document_path, encoding="utf-8") as source:
raw_story = source.read()
story_lines, summary_lines = process_story(raw_story)
return document_name, story_lines, summary_lines
def process_story(raw_story):
""" Extract the story and summary from a story file.
Attributes:
raw_story (str): content of the story file as an utf-8 encoded string.
Returns:
A tuple (story_lines, summary_lines); summary_lines is empty when the story
contains no `@highlight` lines.
"""
nonempty_lines = list(
filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")])
)
# some lines in the dataset are missing a final period; add it
nonempty_lines = [_add_missing_period(line) for line in nonempty_lines]
# gather article lines
story_lines = []
lines = deque(nonempty_lines)
while True:
try:
element = lines.popleft()
if element.startswith("@highlight"):
break
story_lines.append(element)
except IndexError:
# if "@highlight" is absent from the file we pop
# all elements until there is None, raising an exception.
return story_lines, []
# gather summary lines
summary_lines = list(filter(lambda t: not t.startswith("@highlight"), lines))
return story_lines, summary_lines
def _add_missing_period(line):
END_TOKENS = [".", "!", "?", "...", "'", "`", '"', u"\u2019", u"\u201d", ")"]
if line.startswith("@highlight"):
return line
if line[-1] in END_TOKENS:
return line
return line + "."
# --------------------------
# Encoding and preprocessing
# --------------------------
def fit_to_block_size(sequence, block_size, pad_token_id):
""" Adapt the source and target sequences' lengths to the block size.
If the sequence is shorter we append padding token to the right of the sequence.
"""
if len(sequence) > block_size:
return sequence[:block_size]
else:
sequence.extend([pad_token_id] * (block_size - len(sequence)))
return sequence
def build_mask(sequence, pad_token_id):
""" Builds the mask. The attention mechanism will only attend to positions
with value 1. """
mask = torch.ones_like(sequence)
idx_pad_tokens = sequence == pad_token_id
mask[idx_pad_tokens] = 0
return mask
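# A minimal sketch (hypothetical token ids) combining the two helpers above: a
# short sequence is right-padded to the block size, and the mask zeroes out the
# padding positions so that attention ignores them.
def _demo_pad_and_mask(block_size=8, pad_token_id=0):
    sequence = fit_to_block_size([101, 7592, 2088, 102], block_size, pad_token_id)
    mask = build_mask(torch.tensor(sequence), pad_token_id)
    # sequence == [101, 7592, 2088, 102, 0, 0, 0, 0]
    # mask     == tensor([1, 1, 1, 1, 0, 0, 0, 0])
    return sequence, mask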
def encode_for_summarization(story_lines, summary_lines, tokenizer):
""" Encode the story and summary lines, and join them
as specified in [1] by using `[SEP] [CLS]` tokens to separate
sentences.
"""
story_lines_token_ids = [tokenizer.encode(line) for line in story_lines]
story_token_ids = [
token for sentence in story_lines_token_ids for token in sentence
]
summary_lines_token_ids = [tokenizer.encode(line) for line in summary_lines]
summary_token_ids = [
token for sentence in summary_lines_token_ids for token in sentence
]
return story_token_ids, summary_token_ids
def compute_token_type_ids(batch, separator_token_id):
""" Segment embeddings as described in [1]
The values {0,1} were found in the repository [2].
Attributes:
batch: torch.Tensor, size [batch_size, block_size]
Batch of input.
separator_token_id: int
The value of the token that separates the segments.
[1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
arXiv preprint arXiv:1908.08345 (2019).
[2] https://github.com/nlpyang/PreSumm (/src/prepro/data_builder.py, commit fac1217)
"""
batch_embeddings = []
for sequence in batch:
sentence_num = -1
embeddings = []
for s in sequence:
if s == separator_token_id:
sentence_num += 1
embeddings.append(sentence_num % 2)
batch_embeddings.append(embeddings)
return torch.tensor(batch_embeddings)
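# A small usage sketch (hypothetical ids): with 101 as the separator token, the
# tokens of consecutive sentences receive alternating segment ids.
def _demo_compute_token_type_ids():
    batch = torch.tensor([[101, 5, 6, 101, 7, 8]])
    # The first separator starts sentence 0, the second one flips to sentence 1:
    # result == tensor([[0, 0, 0, 1, 1, 1]])
    return compute_token_type_ids(batch, separator_token_id=101)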
# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import torch
from utils_summarization import (
compute_token_type_ids,
fit_to_block_size,
build_mask,
process_story,
)
class SummarizationDataProcessingTest(unittest.TestCase):
def setUp(self):
self.block_size = 10
def test_fit_to_block_sequence_too_small(self):
""" Pad the sequence with 0 if the sequence is smaller than the block size."""
sequence = [1, 2, 3, 4]
expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
self.assertEqual(
fit_to_block_size(sequence, self.block_size, 0), expected_output
)
def test_fit_to_block_sequence_fit_exactly(self):
""" Do nothing if the sequence is the right size. """
sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
self.assertEqual(
fit_to_block_size(sequence, self.block_size, 0), expected_output
)
def test_fit_to_block_sequence_too_big(self):
""" Truncate the sequence if it is too long. """
sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
self.assertEqual(
fit_to_block_size(sequence, self.block_size, 0), expected_output
)
def test_process_story_no_highlights(self):
""" Processing a story with no highlights returns an empty list for the summary.
"""
raw_story = """It was the year of Our Lord one thousand seven hundred and
seventy-five.\n\nSpiritual revelations were conceded to England at that
favoured period, as at this."""
_, summary_lines = process_story(raw_story)
self.assertEqual(summary_lines, [])
def test_process_empty_story(self):
""" An empty story returns an empty collection of lines.
"""
raw_story = ""
story_lines, summary_lines = process_story(raw_story)
self.assertEqual(story_lines, [])
self.assertEqual(summary_lines, [])
def test_process_story_with_missing_period(self):
raw_story = (
"It was the year of Our Lord one thousand seven hundred and "
"seventy-five\n\nSpiritual revelations were conceded to England "
"at that favoured period, as at this.\n@highlight\n\nIt was the best of times"
)
story_lines, summary_lines = process_story(raw_story)
expected_story_lines = [
"It was the year of Our Lord one thousand seven hundred and seventy-five.",
"Spiritual revelations were conceded to England at that favoured period, as at this.",
]
self.assertEqual(expected_story_lines, story_lines)
expected_summary_lines = ["It was the best of times."]
self.assertEqual(expected_summary_lines, summary_lines)
def test_build_mask_no_padding(self):
sequence = torch.tensor([1, 2, 3, 4])
expected = torch.tensor([1, 1, 1, 1])
np.testing.assert_array_equal(build_mask(sequence, 0).numpy(), expected.numpy())
def test_build_mask(self):
sequence = torch.tensor([1, 2, 3, 4, 23, 23, 23])
expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
np.testing.assert_array_equal(
build_mask(sequence, 23).numpy(), expected.numpy()
)
def test_build_mask_with_padding_equal_to_one(self):
sequence = torch.tensor([8, 2, 3, 4, 1, 1, 1])
expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
np.testing.assert_array_equal(build_mask(sequence, 1).numpy(), expected.numpy())
def test_compute_token_type_ids(self):
separator = 101
batch = torch.tensor(
[[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]]
)
expected = torch.tensor(
[[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]]
)
result = compute_token_type_ids(batch, separator)
np.testing.assert_array_equal(result, expected)
if __name__ == "__main__":
unittest.main()
......@@ -72,8 +72,7 @@ class ExamplesTests(unittest.TestCase):
logger.addHandler(stream_handler)
testargs = ["run_squad.py",
"--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
"--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
"--data_dir=./examples/tests_samples/SQUAD",
"--model_name=bert-base-uncased",
"--output_dir=./examples/tests_samples/temp_dir",
"--max_steps=10",
......
{
"version": "v2.0",
"data": [{
"title": "Normans",
"paragraphs": [{
"qas": [{
"question": "In what country is Normandy located?",
"id": "56ddde6b9a695914005b9628",
"answers": [{
"text": "France",
"answer_start": 159
}],
"is_impossible": false
}, {
"question": "When were the Normans in Normandy?",
"id": "56ddde6b9a695914005b9629",
"answers": [{
"text": "10th and 11th centuries",
"answer_start": 94
}],
"is_impossible": false
}, {
"question": "From which countries did the Norse originate?",
"id": "56ddde6b9a695914005b962a",
"answers": [{
"text": "Denmark, Iceland and Norway",
"answer_start": 256
}],
"is_impossible": false
}, {
"plausible_answers": [{
"text": "Rollo",
"answer_start": 308
}],
"question": "Who did King Charles III swear fealty to?",
"id": "5ad39d53604f3c001a3fe8d3",
"answers": [],
"is_impossible": true
}, {
"plausible_answers": [{
"text": "10th century",
"answer_start": 671
}],
"question": "When did the Frankish identity emerge?",
"id": "5ad39d53604f3c001a3fe8d4",
"answers": [],
"is_impossible": true
}],
"context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
}, {
"qas": [{
"question": "Who was the duke in the battle of Hastings?",
"id": "56dddf4066d3e219004dad5f",
"answers": [{
"text": "William the Conqueror",
"answer_start": 1022
}],
"is_impossible": false
}, {
"plausible_answers": [{
"text": "Antioch",
"answer_start": 1295
}],
"question": "What principality did William the conquerer found?",
"id": "5ad3a266604f3c001a3fea2b",
"answers": [],
"is_impossible": true
}],
"context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."
}]
}, {
"title": "Computational_complexity_theory",
"paragraphs": [{
"qas": [{
"question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
"id": "56e16182e3433e1400422e28",
"answers": [{
"text": "Computational complexity theory",
"answer_start": 0
}],
"is_impossible": false
}, {
"plausible_answers": [{
"text": "algorithm",
"answer_start": 472
}],
"question": "What is a manual application of mathematical steps?",
"id": "5ad5316b5b96ef001a10ab76",
"answers": [],
"is_impossible": true
}],
"context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
}, {
"qas": [{
"question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
"id": "56e16839cd28a01900c67887",
"answers": [{
"text": "if its solution requires significant resources",
"answer_start": 46
}],
"is_impossible": false
}, {
"question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
"id": "56e16839cd28a01900c67888",
"answers": [{
"text": "mathematical models of computation",
"answer_start": 176
}],
"is_impossible": false
}, {
"question": "What are two basic primary resources used to guage complexity?",
"id": "56e16839cd28a01900c67889",
"answers": [{
"text": "time and storage",
"answer_start": 305
}],
"is_impossible": false
}, {
"plausible_answers": [{
"text": "the number of gates in a circuit",
"answer_start": 436
}],
"question": "What unit is measured to determine circuit simplicity?",
"id": "5ad532575b96ef001a10ab7f",
"answers": [],
"is_impossible": true
}, {
"plausible_answers": [{
"text": "the number of processors",
"answer_start": 502
}],
"question": "What number is used in perpendicular computing?",
"id": "5ad532575b96ef001a10ab80",
"answers": [],
"is_impossible": true
}],
"context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do."
}]
}]
}
\ No newline at end of file
""" Official evaluation script for SQuAD version 2.0.
Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question IDs to the model's predicted probability
that a question is unanswerable.
"""
import argparse
import collections
import json
import numpy as np
import os
import re
import string
import sys
class EVAL_OPTS():
def __init__(self, data_file, pred_file, out_file="",
na_prob_file="na_prob.json", na_prob_thresh=1.0,
out_image_dir=None, verbose=False):
self.data_file = data_file
self.pred_file = pred_file
self.out_file = out_file
self.na_prob_file = na_prob_file
self.na_prob_thresh = na_prob_thresh
self.out_image_dir = out_image_dir
self.verbose = verbose
OPTS = None
def parse_args():
parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')
parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.')
parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.')
parser.add_argument('--out-file', '-o', metavar='eval.json',
help='Write accuracy metrics to file (default is stdout).')
parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json',
help='Model estimates of probability of no answer.')
parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0,
help='Predict "" if no-answer probability exceeds this (default = 1.0).')
parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None,
help='Save precision-recall curves to directory.')
parser.add_argument('--verbose', '-v', action='store_true')
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
return parser.parse_args()
def make_qid_to_has_ans(dataset):
qid_to_has_ans = {}
for article in dataset:
for p in article['paragraphs']:
for qa in p['qas']:
qid_to_has_ans[qa['id']] = bool(qa['answers'])
return qid_to_has_ans
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
return re.sub(regex, ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def get_tokens(s):
if not s: return []
return normalize_answer(s).split()
def compute_exact(a_gold, a_pred):
return int(normalize_answer(a_gold) == normalize_answer(a_pred))
def compute_f1(a_gold, a_pred):
gold_toks = get_tokens(a_gold)
pred_toks = get_tokens(a_pred)
common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
num_same = sum(common.values())
if len(gold_toks) == 0 or len(pred_toks) == 0:
# If either is no-answer, then F1 is 1 if they agree, 0 otherwise
return int(gold_toks == pred_toks)
if num_same == 0:
return 0
precision = 1.0 * num_same / len(pred_toks)
recall = 1.0 * num_same / len(gold_toks)
f1 = (2 * precision * recall) / (precision + recall)
return f1
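# A worked example (hypothetical strings, not part of the official script): the F1
# score is the harmonic mean of token precision and recall computed on normalized
# answers, so partial overlaps receive partial credit.
def _demo_compute_f1():
    # gold tokens: {10th, and, 11th, centuries}; predicted tokens after the article
    # "the" is removed: {11th, centuries}; overlap = 2 tokens.
    # precision = 2/2, recall = 2/4, F1 = 2 * 1.0 * 0.5 / 1.5 ~= 0.667
    return compute_f1("10th and 11th centuries", "the 11th centuries")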
def get_raw_scores(dataset, preds):
exact_scores = {}
f1_scores = {}
for article in dataset:
for p in article['paragraphs']:
for qa in p['qas']:
qid = qa['id']
gold_answers = [a['text'] for a in qa['answers']
if normalize_answer(a['text'])]
if not gold_answers:
# For unanswerable questions, only correct answer is empty string
gold_answers = ['']
if qid not in preds:
print('Missing prediction for %s' % qid)
continue
a_pred = preds[qid]
# Take max over all gold answers
exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
return exact_scores, f1_scores
def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
new_scores = {}
for qid, s in scores.items():
pred_na = na_probs[qid] > na_prob_thresh
if pred_na:
new_scores[qid] = float(not qid_to_has_ans[qid])
else:
new_scores[qid] = s
return new_scores
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
if not qid_list:
total = len(exact_scores)
return collections.OrderedDict([
('exact', 100.0 * sum(exact_scores.values()) / total),
('f1', 100.0 * sum(f1_scores.values()) / total),
('total', total),
])
else:
total = len(qid_list)
return collections.OrderedDict([
('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
('total', total),
])
def merge_eval(main_eval, new_eval, prefix):
for k in new_eval:
main_eval['%s_%s' % (prefix, k)] = new_eval[k]
def plot_pr_curve(precisions, recalls, out_image, title):
plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.05])
plt.ylim([0.0, 1.05])
plt.title(title)
plt.savefig(out_image)
plt.clf()
def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
out_image=None, title=None):
qid_list = sorted(na_probs, key=lambda k: na_probs[k])
true_pos = 0.0
cur_p = 1.0
cur_r = 0.0
precisions = [1.0]
recalls = [0.0]
avg_prec = 0.0
for i, qid in enumerate(qid_list):
if qid_to_has_ans[qid]:
true_pos += scores[qid]
cur_p = true_pos / float(i+1)
cur_r = true_pos / float(num_true_pos)
if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]:
# i.e., if we can put a threshold after this point
avg_prec += cur_p * (cur_r - recalls[-1])
precisions.append(cur_p)
recalls.append(cur_r)
if out_image:
plot_pr_curve(precisions, recalls, out_image, title)
return {'ap': 100.0 * avg_prec}
def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs,
qid_to_has_ans, out_image_dir):
if out_image_dir and not os.path.exists(out_image_dir):
os.makedirs(out_image_dir)
num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
if num_true_pos == 0:
return
pr_exact = make_precision_recall_eval(
exact_raw, na_probs, num_true_pos, qid_to_has_ans,
out_image=os.path.join(out_image_dir, 'pr_exact.png'),
title='Precision-Recall curve for Exact Match score')
pr_f1 = make_precision_recall_eval(
f1_raw, na_probs, num_true_pos, qid_to_has_ans,
out_image=os.path.join(out_image_dir, 'pr_f1.png'),
title='Precision-Recall curve for F1 score')
oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
pr_oracle = make_precision_recall_eval(
oracle_scores, na_probs, num_true_pos, qid_to_has_ans,
out_image=os.path.join(out_image_dir, 'pr_oracle.png'),
title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)')
merge_eval(main_eval, pr_exact, 'pr_exact')
merge_eval(main_eval, pr_f1, 'pr_f1')
merge_eval(main_eval, pr_oracle, 'pr_oracle')
def histogram_na_prob(na_probs, qid_list, image_dir, name):
if not qid_list:
return
x = [na_probs[k] for k in qid_list]
weights = np.ones_like(x) / float(len(x))
plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
plt.xlabel('Model probability of no-answer')
plt.ylabel('Proportion of dataset')
plt.title('Histogram of no-answer probability: %s' % name)
plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
plt.clf()
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
cur_score = num_no_ans
best_score = cur_score
best_thresh = 0.0
qid_list = sorted(na_probs, key=lambda k: na_probs[k])
for i, qid in enumerate(qid_list):
if qid not in scores: continue
if qid_to_has_ans[qid]:
diff = scores[qid]
else:
if preds[qid]:
diff = -1
else:
diff = 0
cur_score += diff
if cur_score > best_score:
best_score = cur_score
best_thresh = na_probs[qid]
return 100.0 * best_score / len(scores), best_thresh
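# A toy illustration (hypothetical values, not part of the official script) of the
# sweep above: questions are visited in order of increasing no-answer probability,
# and every question above the chosen threshold is treated as answered with "".
def _demo_find_best_thresh():
    preds = {"q1": "France", "q2": "", "q3": "Rollo"}
    exact_scores = {"q1": 1, "q2": 1, "q3": 0}
    na_probs = {"q1": 0.1, "q2": 0.9, "q3": 0.4}
    qid_to_has_ans = {"q1": True, "q2": False, "q3": True}
    # Best threshold is 0.1: q1 keeps its correct answer and q2 is correctly
    # left blank, giving a score of 100 * 2 / 3 ~= 66.7.
    return find_best_thresh(preds, exact_scores, na_probs, qid_to_has_ans)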
def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
cur_score = num_no_ans
best_score = cur_score
best_thresh = 0.0
qid_list = sorted(na_probs, key=lambda k: na_probs[k])
for i, qid in enumerate(qid_list):
if qid not in scores: continue
if qid_to_has_ans[qid]:
diff = scores[qid]
else:
if preds[qid]:
diff = -1
else:
diff = 0
cur_score += diff
if cur_score > best_score:
best_score = cur_score
best_thresh = na_probs[qid]
has_ans_score, has_ans_cnt = 0, 0
for qid in qid_list:
if not qid_to_has_ans[qid]: continue
has_ans_cnt += 1
if qid not in scores: continue
has_ans_score += scores[qid]
return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt
def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
main_eval['best_exact'] = best_exact
main_eval['best_exact_thresh'] = exact_thresh
main_eval['best_f1'] = best_f1
main_eval['best_f1_thresh'] = f1_thresh
def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
main_eval['best_exact'] = best_exact
main_eval['best_exact_thresh'] = exact_thresh
main_eval['best_f1'] = best_f1
main_eval['best_f1_thresh'] = f1_thresh
main_eval['has_ans_exact'] = has_ans_exact
main_eval['has_ans_f1'] = has_ans_f1
def main(OPTS):
with open(OPTS.data_file) as f:
dataset_json = json.load(f)
dataset = dataset_json['data']
with open(OPTS.pred_file) as f:
preds = json.load(f)
if OPTS.na_prob_file:
with open(OPTS.na_prob_file) as f:
na_probs = json.load(f)
else:
na_probs = {k: 0.0 for k in preds}
qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact_raw, f1_raw = get_raw_scores(dataset, preds)
exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
OPTS.na_prob_thresh)
f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
OPTS.na_prob_thresh)
out_eval = make_eval_dict(exact_thresh, f1_thresh)
if has_ans_qids:
has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
merge_eval(out_eval, has_ans_eval, 'HasAns')
if no_ans_qids:
no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
merge_eval(out_eval, no_ans_eval, 'NoAns')
if OPTS.na_prob_file:
find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
if OPTS.na_prob_file and OPTS.out_image_dir:
run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs,
qid_to_has_ans, OPTS.out_image_dir)
histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')
if OPTS.out_file:
with open(OPTS.out_file, 'w') as f:
json.dump(out_eval, f)
else:
print(json.dumps(out_eval, indent=2))
return out_eval
if __name__ == '__main__':
OPTS = parse_args()
if OPTS.out_image_dir:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
main(OPTS)
......@@ -36,9 +36,15 @@ To create the package for pypi.
from io import open
from setuptools import find_packages, setup
extras = {
'serving': ['uvicorn', 'fastapi']
}
extras['all'] = [package for packages in extras.values() for package in packages]
setup(
name="transformers",
version="2.1.1",
version="2.2.1",
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
author_email="thomas@huggingface.co",
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
......@@ -61,8 +67,11 @@ setup(
"transformers=transformers.__main__:main",
]
},
extras_require=extras,
scripts=[
'transformers-cli'
],
# python_requires='>=3.5.0',
tests_require=['pytest'],
classifiers=[
'Intended Audience :: Science/Research',
'License :: OSI Approved :: Apache Software License',
......
# How to add a new example script in 🤗Transformers
This folder provides a template for adding a new example script implementing a training or inference task with the models in the 🤗Transformers library.
Currently, only PyTorch examples are provided. They are adaptations of the library's SQuAD examples and implement single-GPU and distributed training with gradient accumulation and mixed precision (using NVIDIA's apex library) to cover a reasonable range of use cases.