Merge branch 'master' into t5

0558c9cb · thomwolf · 608a8f5b · e57d00ee · 0558c9cb · 0558c9cb
Commit 0558c9cb authored Dec 10, 2019 by thomwolf
20 changed files
--- a/examples/contrib/run_openai_gpt.py
+++ b/examples/contrib/run_openai_gpt.py
@@ -41,7 +41,7 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,

 from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                     AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
-                                     WarmupLinearSchedule)
+                                     get_linear_schedule_with_warmup)

 ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"

@@ -211,7 +211,7 @@ def main():
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
@@ -237,7 +237,7 @@ def main():
    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)

--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -42,7 +42,7 @@ from tqdm import tqdm, trange
 from transformers import (WEIGHTS_NAME, BertConfig,
                                  BertForMultipleChoice, BertTokenizer)

-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup

 logger = logging.getLogger(__name__)

@@ -322,7 +322,7 @@ def train(args, train_dataset, model, tokenizer):
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp

--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -2,6 +2,10 @@

 This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2.

+**December 6th, 2019 - Update** We release **DistilmBERT**: 92% of `bert-base-multilingual-cased` on XNLI. The model supports 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
+
+**November 19th, 2019 - Update** We release German **DistilBERT**: 98.8% of `bert-base-german-dbmdz-cased` on NER tasks.
+
 **October 23rd, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.

 **October 3rd, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper supersedes our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performance. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
@@ -15,8 +19,9 @@ Distil* is a class of compressed models that started with DistilBERT. DistilBERT

 We have applied the same method to other Transformer architectures and released the weights:
 - GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for **DistilGPT2** (after fine-tuning on the train set).
- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base` performance on GLUE while being twice faster and 35% smaller.
- and more to come! 🤗🤗🤗
+- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base`'s performance on GLUE while being twice faster and 35% smaller.
+- German BERT: **German DistilBERT** reaches 99% of `bert-base-german-dbmdz-cased`'s performance on German NER (CoNLL-2003).
+- Multilingual BERT: **DistilmBERT** reaches 92% of Multilingual BERT's performance on XNLI while being twice faster and 25% smaller. The model supports 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).

 For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108).

@@ -27,7 +32,7 @@ Here are the results on the dev sets of GLUE:
 | BERT-base                 |  **77.6**                      | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7              |
 | DistilBERT                |  **76.8**                      | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4              |
 | ---                       |    ---                         |  --- |  --- |  --- |  --- |  --- |  --- |  --- |  --- |  ---              |
-| RoBERTa-base (reported)   |  **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup> |
+| RoBERTa-base (reported)   |  **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup>  |
 | DistilRoBERTa<sup>1</sup> |  **79.0**/**82.3**<sup>2</sup> | 59.4 | 83.9 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1              |

 <sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directly perform transfer learning on the pre-trained DistilRoBERTa.
@@ -36,6 +41,14 @@ Here are the results on the dev sets of GLUE:

 <sup>3</sup> We compute this score ourselves for completeness.

+Here are the results on the *test* sets for 6 of the languages available in XNLI. The results are computed in the zero shot setting (trained on the English portion and evaluated on the target language portion):
+
+| Model                        | English | Spanish | Chinese | German | Arabic  | Urdu |
+| :---:                        | :---:   | :---:   | :---:   | :---:  | :---:   | :---:|
+| mBERT base cased (computed)  | 82.1    | 74.6    | 69.1    | 72.3   | 66.4    | 58.5 |
+| mBERT base uncased (reported)| 81.4    | 74.3    | 63.8    | 70.5   | 62.1    | 58.3 |
+| DistilmBERT                  | 78.2    | 69.1    | 64.0    | 66.3   | 59.1    | 54.7 |
+
 ## Setup

 This part of the library has only been tested with Python3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`.
@@ -45,13 +58,14 @@ This part of the library has only be tested with Python3.6+. There are few speci

 ## How to use DistilBERT

-Transformers includes two pre-trained Distil* models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
+Transformers includes six pre-trained Distil* models, provided for English and German, plus one multilingual model covering 104 languages:

 - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
 - `distilbert-base-uncased-distilled-squad`: A version of `distilbert-base-uncased` fine-tuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches an 88.5 F1 score).
+- `distilbert-base-german-cased`: DistilBERT German language model pretrained on 1/2 of the data used to pretrain Bert using distillation with the supervision of the `bert-base-german-dbmdz-cased` version of German DBMDZ Bert. For NER tasks the model reaches a F1 score of 83.49 on the CoNLL-2003 test set (for comparison, `bert-base-german-dbmdz-cased` reaches a 84.52 F1 score), and a F1 score of 85.23 on the GermEval 2014 test set (`bert-base-german-dbmdz-cased` reaches a 86.89 F1 score).
 - `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
 - `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.
- and more to come! 🤗🤗🤗
+- `distilbert-base-multilingual-cased`: DistilmBERT multilingual model pretrained with the supervision of `bert-base-multilingual-cased` on the concatenation of Wikipedia in 104 different languages. The model supports the 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages). The model has 6 layers, 768 dimension and 12 heads, totalizing 134M parameters (compared to 177M parameters for mBERT-base). On average DistilmBERT is twice as fast as mBERT-base.

 Using DistilBERT is very similar to using BERT. DistilBERT shares the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.

@@ -67,6 +81,7 @@ last_hidden_states = outputs[0]  # The last hidden-state is the first element of
 Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint:
 - DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')`
 - DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')`
+- DistilmBERT: `model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')`


 ## How to train Distil*

--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -21,7 +21,6 @@ import psutil
 import time
 from tqdm import trange, tqdm
 import numpy as np
-import psutil

 import torch
 import torch.nn as nn
@@ -35,7 +34,7 @@ try:
 except:
    from tensorboardX import SummaryWriter

-from transformers import WarmupLinearSchedule
+from transformers import get_linear_schedule_with_warmup

 from utils import logger
 from lm_seqs_dataset import LmSeqsDataset
@@ -137,9 +136,9 @@ class Distiller:
                               betas=(0.9, 0.98))

        warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
-        self.scheduler = WarmupLinearSchedule(self.optimizer,
-                                                warmup_steps=warmup_steps,
-                                                t_total=num_train_optimization_steps)
+        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
+                                                num_warmup_steps=warmup_steps,
+                                                num_training_steps=num_train_optimization_steps)

        if self.fp16:
            try:

--- a/examples/distillation/requirements.txt
+++ b/examples/distillation/requirements.txt
@@ -3,4 +3,4 @@ tensorboard>=1.14.0
 tensorboardX==1.8
 psutil==5.6.3
 scipy==1.3.1
-transformers==2.0.0
+transformers
--- a/examples/distillation/run_squad_w_distillation.py
+++ b/examples/distillation/run_squad_w_distillation.py
@@ -46,7 +46,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
                                  XLNetTokenizer,
                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)

-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup

 from ..utils_squad import (read_squad_examples, convert_examples_to_features,
                         RawResult, write_predictions,
@@ -101,7 +101,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp

--- a/examples/pplm/README.md
+++ b/examples/pplm/README.md
+# Plug and Play Language Models: a Simple Approach to Controlled Text Generation
+
+Authors: [Sumanth Dathathri](https://dathath.github.io/), [Andrea Madotto](https://andreamad8.github.io/), Janice Lan, Jane Hung, Eric Frank, [Piero Molino](https://w4nderlu.st/), [Jason Yosinski](http://yosinski.com/), and [Rosanne Liu](http://www.rosanneliu.com/)
+
+This folder contains the original code used to run the Plug and Play Language Model (PPLM).
+
+Paper link: https://arxiv.org/abs/1912.02164
+
+Blog link: https://eng.uber.com/pplm
+
+Please check out the repo under uber-research for more information: https://github.com/uber-research/PPLM
+
+
+## Setup
+
+```bash
+git clone https://github.com/huggingface/transformers && cd transformers
+pip install [--editable] .
+pip install nltk torchtext # additional requirements.
+cd examples/pplm
+```
+
+## PPLM-BoW 
+
+### Example command for bag-of-words control
+
+```bash
+python run_pplm.py -B military --cond_text "The potato" --length 50 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.03 --window_length 5 --kl_scale 0.01 --gm_scale 0.99 --colorama --sample
+```
+
+### Tuning hyperparameters for bag-of-words control
+
+1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model. 
+
+2. If the language being generated is repetitive (e.g. "science science experiment experiment"), there are several options to consider: <br/>
+	a) Reduce the `--stepsize` <br/>
+	b) Increase `--kl_scale` (the KL-loss coefficient) or decrease `--gm_scale` (the gm-scaling term) <br/>
+	c) Add `--grad-length xx`, where xx is an integer <= length (e.g. `--grad-length 30`).<br/>
+
+
+## PPLM-Discrim
+
+### Example command for discriminator based sentiment control
+
+```bash
+python run_pplm.py -D sentiment --class_label 2 --cond_text "My dog died" --length 50 --gamma 1.0 --num_iterations 10 --num_samples 10 --stepsize 0.04 --kl_scale 0.01 --gm_scale 0.95 --sample
+```
+
+### Tuning hyperparameters for discriminator control
+
+1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model. 
+
+2. Use `--class_label 3` for negative, and `--class_label 2` for positive
+
--- a/examples/pplm/imgs/headfigure.png
+++ b/examples/pplm/imgs/headfigure.png
--- a/examples/pplm/imgs/wooly.png
+++ b/examples/pplm/imgs/wooly.png
--- a/examples/pplm/pplm_classification_head.py
+++ b/examples/pplm/pplm_classification_head.py
+import torch
+
+class ClassificationHead(torch.nn.Module):
+    """Linear classification head applied on top of encoder hidden states.
+
+    Args:
+        class_size: number of classes, i.e. the size of the logits output.
+        embed_size: size of the hidden-state vectors fed into the head.
+    """
+
+    def __init__(self, class_size, embed_size):
+        super().__init__()
+        self.class_size, self.embed_size = class_size, embed_size
+        # The attribute name ``mlp`` determines the state-dict keys
+        # (``mlp.weight`` / ``mlp.bias``), so it must not be renamed.
+        self.mlp = torch.nn.Linear(embed_size, class_size)
+
+    def forward(self, hidden_state):
+        """Return the class logits for ``hidden_state``."""
+        return self.mlp(hidden_state)
--- a/examples/pplm/run_pplm.py
+++ b/examples/pplm/run_pplm.py
+#! /usr/bin/env python3
+# coding=utf-8
+
+#Copyright (c) 2019 Uber Technologies, Inc.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+"""
+Example command with bag of words:
+python examples/pplm/run_pplm.py -B space --cond_text "The president" --length 100 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.01 --window_length 5 --kl_scale 0.01 --gm_scale 0.95
+
+Example command with discriminator:
+python examples/pplm/run_pplm.py -D sentiment --class_label 3 --cond_text "The lake" --length 10 --gamma 1.0 --num_iterations 30 --num_samples 10 --stepsize 0.01 --kl_scale 0.01 --gm_scale 0.95
+"""
+
+import argparse
+import json
+from operator import add
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+from tqdm import trange
+
+from transformers import GPT2Tokenizer
+from transformers.file_utils import cached_path
+from transformers.modeling_gpt2 import GPT2LMHeadModel
+from pplm_classification_head import ClassificationHead
+
+PPLM_BOW = 1
+PPLM_DISCRIM = 2
+PPLM_BOW_DISCRIM = 3
+SMALL_CONST = 1e-15
+BIG_CONST = 1e10
+
+BAG_OF_WORDS_ARCHIVE_MAP = {
+    'legal': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt",
+    'military': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt",
+    'politics': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt",
+    'religion': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt",
+    'science': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt",
+    'space': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt",
+    'technology': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/technology.txt",
+}
+
+DISCRIMINATOR_MODELS_PARAMS = {
+    "clickbait": {
+        "url": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/discriminators/clickbait_classifier_head.pt",
+        "class_size": 2,
+        "embed_size": 1024,
+        "class_vocab": {"non_clickbait": 0, "clickbait": 1},
+        "default_class": 1,
+        "pretrained_model": "gpt2-medium",
+    },
+    "sentiment": {
+        "url": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/discriminators/SST_classifier_head.pt",
+        "class_size": 5,
+        "embed_size": 1024,
+        "class_vocab": {"very_positive": 2, "very_negative": 3},
+        "default_class": 3,
+        "pretrained_model": "gpt2-medium",
+    },
+}
+
+
+def to_var(x, requires_grad=False, volatile=False, device='cuda'):
+    """Move ``x`` to ``device`` and wrap it in an autograd ``Variable``.
+
+    When ``device`` is ``'cuda'`` the tensor is moved only if CUDA is
+    available; otherwise it is left where it is. Any other ``device`` value
+    is passed straight to ``Tensor.to``. ``requires_grad`` and ``volatile``
+    are forwarded unchanged to ``Variable``.
+    """
+    if device != 'cuda':
+        x = x.to(device)
+    elif torch.cuda.is_available():
+        x = x.cuda()
+    return Variable(x, requires_grad=requires_grad, volatile=volatile)
+
+
+def top_k_filter(logits, k, probs=False):
+    """
+    Keep the k largest entries of every row and mask out the others.
+
+    Masked entries are set to -BIG_CONST (a stand-in for -infinity, so that
+    they vanish after a softmax) when `logits` holds logits, or to 0.0 when
+    `probs` is True. With k == 0 the input is returned untouched.
+    """
+    if k == 0:
+        return logits
+
+    top_values, _ = torch.topk(logits, k)
+    # Smallest of the k retained values in each row, broadcast to full width.
+    row_threshold = top_values[:, -1].view(-1, 1).expand_as(logits)
+    fill_value = 0.0 if probs else -BIG_CONST
+    masked = torch.ones_like(logits) * fill_value
+    return torch.where(logits < row_threshold, masked, logits)
+
+
+def perturb_past(
+        past,
+        model,
+        last,
+        unpert_past=None,
+        unpert_logits=None,
+        accumulated_hidden=None,
+        grad_norms=None,
+        stepsize=0.01,
+        one_hot_bows_vectors=None,
+        classifier=None,
+        class_label=None,
+        loss_type=0,
+        num_iterations=3,
+        horizon_length=1,
+        window_length=0,
+        decay=False,
+        gamma=1.5,
+        kl_scale=0.01,
+        device='cuda',
+):
+    """Perturb the cached `past` by gradient steps on the PPLM loss.
+
+    For `num_iterations` rounds, a perturbation is added to every tensor of
+    `past`, the model is run on `last` with the perturbed past, a loss is
+    built (bag-of-words and/or discriminator term, plus an optional KL term
+    against the unperturbed distribution) and the normalised gradient with
+    respect to the perturbation is accumulated.
+
+    Args:
+        past: sequence of 5-D tensors; the fourth dimension is read as the
+            current sequence length.
+        model: called as `model(last, past=...)` and
+            `model(past=..., inputs_embeds=...)`, each returning
+            `(logits, past, all_hidden)`; must expose
+            `resize_token_embeddings()`.
+        last: input passed to `model` together with the perturbed past.
+        unpert_past: unperturbed past used to roll the model forward in the
+            discriminator loss.
+        unpert_logits: logits of the unperturbed model; used for the KL term.
+        accumulated_hidden: running sum of hidden states (`None` means 0).
+        grad_norms: gradient norms from a previous call, or `None`.
+        stepsize: scale applied to each normalised gradient step.
+        one_hot_bows_vectors: one matrix per bag of words, multiplied with
+            the next-token probabilities.
+        classifier: discriminator applied to the averaged hidden state.
+        class_label: target class index for the discriminator loss.
+        loss_type: one of PPLM_BOW, PPLM_DISCRIM, PPLM_BOW_DISCRIM.
+        num_iterations: number of gradient steps.
+        horizon_length: number of extra forward steps taken for the
+            discriminator loss.
+        window_length: if > 0 and shorter than the current length, only that
+            many positions of the past are perturbed.
+        decay: if True (and `window_length` > 0), scale the window mask with
+            linearly increasing weights.
+        gamma: exponent applied to the gradient norm when normalising.
+        kl_scale: weight of the KL term; no KL term when <= 0.
+        device: device on which masks and perturbations are placed.
+
+    Returns:
+        Tuple `(pert_past, new_accumulated_hidden, grad_norms,
+        loss_per_iter)`.
+    """
+    # Generate initial perturbed past
+    grad_accumulator = [
+        (np.zeros(p.shape).astype("float32"))
+        for p in past
+    ]
+
+    if accumulated_hidden is None:
+        accumulated_hidden = 0
+
+    # The decay mask is only used over a non-empty window. Guarding on
+    # window_length also avoids a ZeroDivisionError when decay is requested
+    # with the default window_length of 0.
+    if decay and window_length > 0:
+        decay_mask = torch.arange(
+            0.,
+            1.0 + SMALL_CONST,
+            1.0 / (window_length)
+        )[1:]
+    else:
+        decay_mask = 1.0
+
+    # Build a mask that restricts the gradient perturbation to a window of
+    # `window_length` positions of the past (all positions if no window).
+    _, _, _, curr_length, _ = past[0].shape
+
+    if curr_length > window_length and window_length > 0:
+        ones_key_val_shape = (
+                tuple(past[0].shape[:-2])
+                + tuple([window_length])
+                + tuple(past[0].shape[-1:])
+        )
+
+        zeros_key_val_shape = (
+                tuple(past[0].shape[:-2])
+                + tuple([curr_length - window_length])
+                + tuple(past[0].shape[-1:])
+        )
+
+        ones_mask = torch.ones(ones_key_val_shape)
+        # Swap the last two axes so the decay weights broadcast along the
+        # window axis, then swap back.
+        ones_mask = decay_mask * ones_mask.permute(0, 1, 2, 4, 3)
+        ones_mask = ones_mask.permute(0, 1, 2, 4, 3)
+
+        window_mask = torch.cat(
+            (ones_mask, torch.zeros(zeros_key_val_shape)),
+            dim=-2
+        ).to(device)
+    else:
+        window_mask = torch.ones_like(past[0]).to(device)
+
+    # accumulate perturbations for num_iterations
+    loss_per_iter = []
+    new_accumulated_hidden = None
+    for i in range(num_iterations):
+        print("Iteration ", i + 1)
+        curr_perturbation = [
+            to_var(torch.from_numpy(p_), requires_grad=True, device=device)
+            for p_ in grad_accumulator
+        ]
+
+        # Compute hidden using perturbed past
+        perturbed_past = list(map(add, past, curr_perturbation))
+        _, _, _, curr_length, _ = curr_perturbation[0].shape
+        all_logits, _, all_hidden = model(last, past=perturbed_past)
+        hidden = all_hidden[-1]
+        new_accumulated_hidden = accumulated_hidden + torch.sum(
+            hidden,
+            dim=1
+        ).detach()
+        # TODO: Check the layer-norm consistency of this with trained discriminator (Sumanth)
+        logits = all_logits[:, -1, :]
+        probs = F.softmax(logits, dim=-1)
+
+        loss = 0.0
+        loss_list = []
+        if loss_type in (PPLM_BOW, PPLM_BOW_DISCRIM):
+            for one_hot_bow in one_hot_bows_vectors:
+                bow_logits = torch.mm(probs, torch.t(one_hot_bow))
+                bow_loss = -torch.log(torch.sum(bow_logits))
+                loss += bow_loss
+                loss_list.append(bow_loss)
+            print(" pplm_bow_loss:", loss.data.cpu().numpy())
+
+        if loss_type in (PPLM_DISCRIM, PPLM_BOW_DISCRIM):
+            ce_loss = torch.nn.CrossEntropyLoss()
+            # Work on a separate name: the loop below rebinds it on every
+            # horizon step, and `unpert_past` must stay untouched for the
+            # next outer iteration.
+            curr_unpert_past = unpert_past
+            curr_probs = torch.unsqueeze(probs, dim=1)
+            wte = model.resize_token_embeddings()
+            for _ in range(horizon_length):
+                inputs_embeds = torch.matmul(curr_probs, wte.weight.data)
+                _, curr_unpert_past, curr_all_hidden = model(
+                    past=curr_unpert_past,
+                    inputs_embeds=inputs_embeds
+                )
+                curr_hidden = curr_all_hidden[-1]
+                new_accumulated_hidden = new_accumulated_hidden + torch.sum(
+                    curr_hidden, dim=1)
+
+            prediction = classifier(new_accumulated_hidden /
+                                    (curr_length + 1 + horizon_length))
+
+            label = torch.tensor(prediction.shape[0] * [class_label],
+                                 device=device,
+                                 dtype=torch.long)
+            discrim_loss = ce_loss(prediction, label)
+            print(" pplm_discrim_loss:", discrim_loss.data.cpu().numpy())
+            loss += discrim_loss
+            loss_list.append(discrim_loss)
+
+        kl_loss = 0.0
+        if kl_scale > 0.0:
+            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
+            # Nudge (near-)zero probabilities by SMALL_CONST so the ratio
+            # and log below stay finite.
+            unpert_probs = (
+                    unpert_probs + SMALL_CONST *
+                    (unpert_probs <= SMALL_CONST).float().to(device).detach()
+            )
+            correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(
+                device).detach()
+            corrected_probs = probs + correction.detach()
+            kl_loss = kl_scale * (
+                (corrected_probs * (corrected_probs / unpert_probs).log()).sum()
+            )
+            print(' kl_loss', kl_loss.data.cpu().numpy())
+            loss += kl_loss
+
+        loss_per_iter.append(loss.data.cpu().numpy())
+        print(' pplm_loss', (loss - kl_loss).data.cpu().numpy())
+
+        # compute gradients
+        loss.backward()
+
+        # calculate gradient norms
+        if grad_norms is not None and loss_type == PPLM_BOW:
+            grad_norms = [
+                torch.max(grad_norms[index], torch.norm(p_.grad * window_mask))
+                for index, p_ in enumerate(curr_perturbation)
+            ]
+        else:
+            grad_norms = [
+                (torch.norm(p_.grad * window_mask) + SMALL_CONST)
+                for index, p_ in enumerate(curr_perturbation)
+            ]
+
+        # normalize gradients
+        grad = [
+            -stepsize *
+            (p_.grad * window_mask / grad_norms[
+                index] ** gamma).data.cpu().numpy()
+            for index, p_ in enumerate(curr_perturbation)
+        ]
+
+        # accumulate gradient
+        grad_accumulator = list(map(add, grad, grad_accumulator))
+
+        # reset gradients, just to make sure
+        for p_ in curr_perturbation:
+            p_.grad.data.zero_()
+
+        # removing past from the graph
+        new_past = []
+        for p_ in past:
+            new_past.append(p_.detach())
+        past = new_past
+
+    # apply the accumulated perturbations to the past
+    grad_accumulator = [
+        to_var(torch.from_numpy(p_), requires_grad=True, device=device)
+        for p_ in grad_accumulator
+    ]
+    pert_past = list(map(add, past, grad_accumulator))
+
+    return pert_past, new_accumulated_hidden, grad_norms, loss_per_iter
+
+
+def get_classifier(
+        name: Optional[str], class_label: Union[str, int],
+        device: str
+) -> Tuple[Optional[ClassificationHead], Optional[int]]:
+    """Load the discriminator head called `name` and resolve the target class.
+
+    Returns `(None, None)` when `name` is None. Otherwise the head is built
+    from DISCRIMINATOR_MODELS_PARAMS[name], its weights are loaded from the
+    entry's "url" (through `cached_path`) or "path", and it is switched to
+    eval mode. `class_label` may be a class name or a class id; a value
+    missing from the entry's "class_vocab" falls back to "default_class"
+    with a printed notice, and any other type falls back silently.
+
+    Raises:
+        ValueError: if the entry has neither "url" nor "path".
+    """
+    if name is None:
+        return None, None
+
+    params = DISCRIMINATOR_MODELS_PARAMS[name]
+    classifier = ClassificationHead(
+        class_size=params['class_size'],
+        embed_size=params['embed_size']
+    ).to(device)
+
+    if "url" in params:
+        weights_file = cached_path(params["url"])
+    elif "path" in params:
+        weights_file = params["path"]
+    else:
+        raise ValueError("Either url or path have to be specified "
+                         "in the discriminator model parameters")
+
+    # NOTE: torch.load unpickles the file, so "url"/"path" must point at a
+    # trusted checkpoint.
+    state_dict = torch.load(weights_file, map_location=device)
+    classifier.load_state_dict(state_dict)
+    classifier.eval()
+
+    if isinstance(class_label, str):
+        recognised = class_label in params["class_vocab"]
+        resolved = params["class_vocab"][class_label] if recognised else None
+    elif isinstance(class_label, int):
+        recognised = class_label in set(params["class_vocab"].values())
+        resolved = class_label
+    else:
+        # Neither a class name nor a class id: use the default, no notice.
+        return classifier, params["default_class"]
+
+    if recognised:
+        return classifier, resolved
+
+    label_id = params["default_class"]
+    print("class_label {} not in class_vocab".format(class_label))
+    print("available values are: {}".format(params["class_vocab"]))
+    print("using default class {}".format(label_id))
+    return classifier, label_id
+
+
+def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> \
+        List[List[List[int]]]:
+    """Tokenize every word of each requested bag of words.
+
+    Args:
+        bag_of_words_ids_or_paths: entries are either keys of
+            BAG_OF_WORDS_ARCHIVE_MAP (resolved through `cached_path`) or
+            paths to text files holding one word per line.
+        tokenizer: object whose `encode(word, add_prefix_space=True)`
+            returns the token ids of a word.
+
+    Returns:
+        One list per bag, each holding one list of token ids per word.
+    """
+    bow_indices = []
+    for id_or_path in bag_of_words_ids_or_paths:
+        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
+            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
+        else:
+            filepath = id_or_path
+        # Decode explicitly as UTF-8 so the word list reads the same
+        # regardless of the platform's default encoding.
+        with open(filepath, "r", encoding="utf-8") as f:
+            words = f.read().strip().split("\n")
+        bow_indices.append(
+            [tokenizer.encode(word.strip(), add_prefix_space=True)
+             for word in words]
+        )
+    return bow_indices
+
+
+def build_bows_one_hot_vectors(bow_indices, tokenizer, device='cuda'):
+    """Turn tokenized bags of words into one-hot matrices over the vocabulary.
+
+    Only words that were encoded as exactly one token are kept; words that
+    produced several tokens, or none at all, are dropped.
+
+    Args:
+        bow_indices: one list per bag, each holding one list of token ids
+            per word, or None.
+        tokenizer: object exposing `vocab_size`.
+        device: device on which the matrices are created.
+
+    Returns:
+        None if `bow_indices` is None, otherwise a list with one
+        `(num_single_token_words, vocab_size)` tensor per bag.
+
+    Raises:
+        ValueError: if a bag contains no single-token word.
+    """
+    if bow_indices is None:
+        return None
+
+    one_hot_bows_vectors = []
+    for single_bow in bow_indices:
+        # `len(ids) == 1` rather than `<= 1`: an empty id list would make
+        # the nested list ragged and torch.tensor would reject it.
+        single_token_words = [ids for ids in single_bow if len(ids) == 1]
+        if not single_token_words:
+            raise ValueError(
+                "bag of words contains no word encoded as a single token")
+        token_ids = torch.tensor(
+            single_token_words, dtype=torch.long).to(device)
+        num_words = token_ids.shape[0]
+        one_hot_bow = torch.zeros(num_words, tokenizer.vocab_size).to(device)
+        one_hot_bow.scatter_(1, token_ids, 1)
+        one_hot_bows_vectors.append(one_hot_bow)
+    return one_hot_bows_vectors
+
+
+def full_text_generation(
+        model,
+        tokenizer,
+        context=None,
+        num_samples=1,
+        device="cuda",
+        bag_of_words=None,
+        discrim=None,
+        class_label=None,
+        length=100,
+        stepsize=0.02,
+        temperature=1.0,
+        top_k=10,
+        sample=False,
+        num_iterations=3,
+        grad_length=10000,
+        horizon_length=1,
+        window_length=0,
+        decay=False,
+        gamma=1.5,
+        gm_scale=0.9,
+        kl_scale=0.01,
+        **kwargs
+):
+    """Generate one unperturbed text and `num_samples` perturbed texts.
+
+    The control mode comes from the arguments: `bag_of_words` (a
+    ";"-separated list of ids or paths) enables PPLM-BoW, `discrim` enables
+    PPLM-Discrim, and both together enable the combined loss. Extra keyword
+    arguments are accepted and ignored.
+
+    Returns:
+        Tuple `(unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses,
+        losses_in_time)`; `discrim_losses` stays empty when no discriminator
+        is used.
+
+    Raises:
+        Exception: if neither a bag of words nor a discriminator is given.
+    """
+    classifier, class_id = get_classifier(discrim, class_label, device)
+
+    if bag_of_words:
+        bow_indices = get_bag_of_words_indices(
+            bag_of_words.split(";"), tokenizer)
+    else:
+        bow_indices = []
+
+    if bag_of_words and classifier:
+        loss_type = PPLM_BOW_DISCRIM
+        print("Both PPLM-BoW and PPLM-Discrim are on. This is not optimized.")
+    elif bag_of_words:
+        loss_type = PPLM_BOW
+        print("Using PPLM-BoW")
+    elif classifier is not None:
+        loss_type = PPLM_DISCRIM
+        print("Using PPLM-Discrim")
+    else:
+        raise Exception("Specify either a bag of words or a discriminator")
+
+    # Arguments common to the unperturbed and the perturbed generation.
+    shared_args = dict(
+        model=model,
+        tokenizer=tokenizer,
+        context=context,
+        device=device,
+        length=length,
+        sample=sample,
+    )
+
+    unpert_gen_tok_text, _, _ = generate_text_pplm(
+        perturb=False, **shared_args)
+    if device == 'cuda':
+        torch.cuda.empty_cache()
+
+    pert_gen_tok_texts, discrim_losses, losses_in_time = [], [], []
+    for _ in range(num_samples):
+        pert_gen_tok_text, discrim_loss, loss_in_time = generate_text_pplm(
+            perturb=True,
+            bow_indices=bow_indices,
+            classifier=classifier,
+            class_label=class_id,
+            loss_type=loss_type,
+            stepsize=stepsize,
+            temperature=temperature,
+            top_k=top_k,
+            num_iterations=num_iterations,
+            grad_length=grad_length,
+            horizon_length=horizon_length,
+            window_length=window_length,
+            decay=decay,
+            gamma=gamma,
+            gm_scale=gm_scale,
+            kl_scale=kl_scale,
+            **shared_args
+        )
+        pert_gen_tok_texts.append(pert_gen_tok_text)
+        if classifier is not None:
+            discrim_losses.append(discrim_loss.data.cpu().numpy())
+        losses_in_time.append(loss_in_time)
+
+    if device == 'cuda':
+        torch.cuda.empty_cache()
+
+    return unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
+
+
+def generate_text_pplm(
+        model,
+        tokenizer,
+        context=None,
+        past=None,
+        device="cuda",
+        perturb=True,
+        bow_indices=None,
+        classifier=None,
+        class_label=None,
+        loss_type=0,
+        length=100,
+        stepsize=0.02,
+        temperature=1.0,
+        top_k=10,
+        sample=False,
+        num_iterations=3,
+        grad_length=10000,
+        horizon_length=1,
+        window_length=0,
+        decay=False,
+        gamma=1.5,
+        gm_scale=0.9,
+        kl_scale=0.01,
+):
+    """Extend ``context`` by ``length`` tokens, optionally perturbing the past.
+
+    Each step runs the model on the full output so far, optionally replaces
+    the cached ``past`` with the result of ``perturb_past``, picks the next
+    token, appends it, and prints the decoded output so far.
+
+    Args:
+        model: called as ``model(ids)`` and ``model(ids, past=...)``; the
+            result is unpacked as ``(logits, past, all_hidden)``.
+        tokenizer: used to build the one-hot bag-of-words vectors and to
+            decode the running output for printing.
+        context: token ids; converted to a long tensor and unsqueezed until
+            it is 2-D.
+        past: initial cached past; when None it is computed from all but the
+            last token of the output so far.
+        device: torch device for the tensors created here.
+        perturb: when False the past is used unmodified and top-k filtering
+            is applied to the logits instead of the fused probabilities.
+        bow_indices: passed to ``build_bows_one_hot_vectors``.
+        classifier: when not None, applied at every step to the mean (over
+            dim 1) of the unperturbed last hidden state to compute a
+            cross-entropy loss against ``class_label``.
+        class_label: target class index for that loss.
+        loss_type, num_iterations, horizon_length, window_length, decay,
+            gamma, kl_scale: forwarded to ``perturb_past``.
+        length: number of tokens to generate.
+        stepsize: step size forwarded to ``perturb_past``; replaced by 0 for
+            steps whose index is >= ``grad_length``.
+        temperature: divisor applied to the last-position logits.
+        top_k: ``k`` forwarded to ``top_k_filter``.
+        sample: draw the next token with ``torch.multinomial`` when True,
+            otherwise take the single most probable token.
+        gm_scale: exponent on the perturbed probabilities when fusing them
+            with the unperturbed ones (the latter get ``1 - gm_scale``).
+
+    Returns:
+        ``(output_so_far, unpert_discrim_loss, loss_in_time)``: the token
+        tensor including the context, the unperturbed discriminator loss of
+        the final step (0 when no classifier is given), and the list of
+        per-step losses returned by ``perturb_past``.
+    """
+    output_so_far = None
+    if context:
+        context_t = torch.tensor(context, device=device, dtype=torch.long)
+        while len(context_t.shape) < 2:
+            context_t = context_t.unsqueeze(0)
+        output_so_far = context_t
+
+    # collect one hot vectors for bags of words
+    one_hot_bows_vectors = build_bows_one_hot_vectors(bow_indices, tokenizer,
+                                                      device)
+
+    grad_norms = None
+    last = None
+    unpert_discrim_loss = 0
+    loss_in_time = []
+    for i in trange(length, ascii=True):
+
+        # Get past/probs for current output, except for last word
+        # Note that GPT takes 2 inputs: past + current_token
+
+        # run model forward to obtain unperturbed
+        if past is None and output_so_far is not None:
+            last = output_so_far[:, -1:]
+            if output_so_far.shape[1] > 1:
+                _, past, _ = model(output_so_far[:, :-1])
+
+        # NOTE(review): this call is unconditional, so an empty/None context
+        # (output_so_far is None) presumably is not supported here - confirm.
+        unpert_logits, unpert_past, unpert_all_hidden = model(output_so_far)
+        unpert_last_hidden = unpert_all_hidden[-1]
+
+        # check if we are above grad max length
+        if i >= grad_length:
+            current_stepsize = stepsize * 0
+        else:
+            current_stepsize = stepsize
+
+        # modify the past if necessary
+        if not perturb or num_iterations == 0:
+            pert_past = past
+
+        else:
+            # sum of the last hidden state over dim 1, excluding the final
+            # index (presumably the sequence axis - TODO confirm)
+            accumulated_hidden = unpert_last_hidden[:, :-1, :]
+            accumulated_hidden = torch.sum(accumulated_hidden, dim=1)
+
+            if past is not None:
+                pert_past, _, grad_norms, loss_this_iter = perturb_past(
+                    past,
+                    model,
+                    last,
+                    unpert_past=unpert_past,
+                    unpert_logits=unpert_logits,
+                    accumulated_hidden=accumulated_hidden,
+                    grad_norms=grad_norms,
+                    stepsize=current_stepsize,
+                    one_hot_bows_vectors=one_hot_bows_vectors,
+                    classifier=classifier,
+                    class_label=class_label,
+                    loss_type=loss_type,
+                    num_iterations=num_iterations,
+                    horizon_length=horizon_length,
+                    window_length=window_length,
+                    decay=decay,
+                    gamma=gamma,
+                    kl_scale=kl_scale,
+                    device=device,
+                )
+                loss_in_time.append(loss_this_iter)
+            else:
+                # nothing cached yet, so there is nothing to perturb
+                pert_past = past
+
+        # advance one token using the (possibly perturbed) past; the returned
+        # past becomes the cache for the next step
+        pert_logits, past, pert_all_hidden = model(last, past=pert_past)
+        pert_logits = pert_logits[:, -1, :] / temperature  # + SMALL_CONST
+        pert_probs = F.softmax(pert_logits, dim=-1)
+
+        # reporting only: this loss is printed and returned, not optimised here
+        if classifier is not None:
+            ce_loss = torch.nn.CrossEntropyLoss()
+            prediction = classifier(torch.mean(unpert_last_hidden, dim=1))
+            label = torch.tensor([class_label], device=device,
+                                 dtype=torch.long)
+            unpert_discrim_loss = ce_loss(prediction, label)
+            print(
+                "unperturbed discrim loss",
+                unpert_discrim_loss.data.cpu().numpy()
+            )
+        else:
+            unpert_discrim_loss = 0
+
+        # Fuse the modified model and original model
+        if perturb:
+
+            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
+
+            # weighted geometric combination of the two distributions
+            pert_probs = ((pert_probs ** gm_scale) * (
+                    unpert_probs ** (1 - gm_scale)))  # + SMALL_CONST
+            pert_probs = top_k_filter(pert_probs, k=top_k,
+                                      probs=True)  # + SMALL_CONST
+
+            # rescale
+            if torch.sum(pert_probs) <= 1:
+                pert_probs = pert_probs / torch.sum(pert_probs)
+
+        else:
+            pert_logits = top_k_filter(pert_logits, k=top_k)  # + SMALL_CONST
+            pert_probs = F.softmax(pert_logits, dim=-1)
+
+        # sample or greedy
+        if sample:
+            last = torch.multinomial(pert_probs, num_samples=1)
+
+        else:
+            _, last = torch.topk(pert_probs, k=1, dim=-1)
+
+        # update context/output_so_far appending the new token
+        output_so_far = (
+            last if output_so_far is None
+            else torch.cat((output_so_far, last), dim=1)
+        )
+
+        print(tokenizer.decode(output_so_far.tolist()[0]))
+
+    return output_so_far, unpert_discrim_loss, loss_in_time
+
+
+def set_generic_model_params(discrim_weights, discrim_meta):
+    """Register the 'generic' discriminator from a weights path and meta file.
+
+    The JSON content of ``discrim_meta`` is loaded, its ``'path'`` entry is set
+    to ``discrim_weights``, and the result is stored under the ``'generic'``
+    key of ``DISCRIMINATOR_MODELS_PARAMS``.
+
+    Raises:
+        ValueError: if either argument is None.
+    """
+    if discrim_weights is None:
+        raise ValueError('When using a generic discriminator, '
+                         'discrim_weights need to be specified')
+    if discrim_meta is None:
+        raise ValueError('When using a generic discriminator, '
+                         'discrim_meta need to be specified')
+
+    with open(discrim_meta, 'r') as meta_fp:
+        generic_params = json.load(meta_fp)
+
+    # The weights location always comes from the caller, not from the file.
+    generic_params['path'] = discrim_weights
+    DISCRIMINATOR_MODELS_PARAMS['generic'] = generic_params
+
+
+def run_pplm_example(
+        pretrained_model="gpt2-medium",
+        cond_text="",
+        uncond=False,
+        num_samples=1,
+        bag_of_words=None,
+        discrim=None,
+        discrim_weights=None,
+        discrim_meta=None,
+        class_label=-1,
+        length=100,
+        stepsize=0.02,
+        temperature=1.0,
+        top_k=10,
+        sample=False,
+        num_iterations=3,
+        grad_length=10000,
+        horizon_length=1,
+        window_length=0,
+        decay=False,
+        gamma=1.5,
+        gm_scale=0.9,
+        kl_scale=0.01,
+        seed=0,
+        no_cuda=False,
+        colorama=False
+):
+    """Load GPT-2, run PPLM generation, and print the resulting texts.
+
+    Prints the conditioning prefix, the unperturbed continuation, and every
+    perturbed continuation. Returns None.
+
+    Args:
+        pretrained_model: model name or path; overridden by the
+            discriminator's own ``pretrained_model`` when ``discrim`` is set.
+        cond_text: prompt text; when empty (and ``uncond`` is False) the user
+            is asked for one on stdin.
+        uncond: condition on the tokenizer's BOS token only.
+        discrim, discrim_weights, discrim_meta: discriminator selection; the
+            last two are only used when ``discrim == 'generic'``.
+        seed: seed for the torch and numpy random generators.
+        no_cuda: force CPU even when CUDA is available.
+        colorama: print tokens that belong to a bag of words in red.
+        Remaining arguments are forwarded to ``full_text_generation``.
+    """
+    # set Random seed
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+
+    # set the device
+    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
+
+    if discrim == 'generic':
+        set_generic_model_params(discrim_weights, discrim_meta)
+
+    if discrim is not None:
+        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
+            "pretrained_model"
+        ]
+        print("discrim = {}, pretrained_model set "
+              "to discriminator's = {}".format(discrim, pretrained_model))
+
+    # load pretrained model
+    model = GPT2LMHeadModel.from_pretrained(
+        pretrained_model,
+        output_hidden_states=True
+    )
+    model.to(device)
+    model.eval()
+
+    # load tokenizer
+    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
+
+    # Freeze GPT-2 weights
+    for param in model.parameters():
+        param.requires_grad = False
+
+    # figure out conditioning text
+    if uncond:
+        tokenized_cond_text = tokenizer.encode(
+            [tokenizer.bos_token]
+        )
+    else:
+        raw_text = cond_text
+        while not raw_text:
+            print("Did you forget to add `--cond_text`? ")
+            raw_text = input("Model prompt >>> ")
+        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text)
+
+    print("= Prefix of sentence =")
+    print(tokenizer.decode(tokenized_cond_text))
+    print()
+
+    # generate unperturbed and perturbed texts
+
+    # full_text_generation returns:
+    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
+    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
+        model=model,
+        tokenizer=tokenizer,
+        context=tokenized_cond_text,
+        device=device,
+        num_samples=num_samples,
+        bag_of_words=bag_of_words,
+        discrim=discrim,
+        class_label=class_label,
+        length=length,
+        stepsize=stepsize,
+        temperature=temperature,
+        top_k=top_k,
+        sample=sample,
+        num_iterations=num_iterations,
+        grad_length=grad_length,
+        horizon_length=horizon_length,
+        window_length=window_length,
+        decay=decay,
+        gamma=gamma,
+        gm_scale=gm_scale,
+        kl_scale=kl_scale,
+    )
+
+    # untokenize unperturbed text
+    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])
+
+    print("=" * 80)
+    print("= Unperturbed generated text =")
+    print(unpert_gen_text)
+    print()
+
+    generated_texts = []
+
+    # Import the colouring package once, under its own name, so the boolean
+    # ``colorama`` flag is not rebound to the module. A missing package
+    # degrades to plain output instead of silently printing nothing.
+    colorama_module = None
+    if colorama:
+        try:
+            import colorama as colorama_module
+        except ImportError:
+            print("colorama is not installed; printing without colors")
+
+    bow_word_ids = set()
+    if bag_of_words and colorama_module is not None:
+        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
+                                               tokenizer)
+        for single_bow_list in bow_indices:
+            # keep only the words encoded as exactly one token, so w[0] below
+            # is always valid (empty encodings are dropped as well)
+            filtered = [w for w in single_bow_list if len(w) == 1]
+            bow_word_ids.update(w[0] for w in filtered)
+
+    # iterate through the perturbed texts
+    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
+        try:
+            # untokenize perturbed text
+            token_ids = pert_gen_tok_text.tolist()[0]
+            if colorama_module is not None:
+                pieces = []
+                for word_id in token_ids:
+                    piece = tokenizer.decode([word_id])
+                    if word_id in bow_word_ids:
+                        piece = '{}{}{}'.format(
+                            colorama_module.Fore.RED,
+                            piece,
+                            colorama_module.Style.RESET_ALL
+                        )
+                    pieces.append(piece)
+                pert_gen_text = ''.join(pieces)
+            else:
+                pert_gen_text = tokenizer.decode(token_ids)
+
+            print("= Perturbed generated text {} =".format(i + 1))
+            print(pert_gen_text)
+            print()
+        except Exception as err:
+            # Displaying a sample is best-effort: report the failure and move
+            # on to the next sample rather than hiding it.
+            print("Could not display perturbed text {}: {}".format(i + 1, err))
+
+        # keep the prefix, perturbed seq, original seq for each index
+        # NOTE(review): this list is collected but not returned.
+        generated_texts.append(
+            (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text)
+        )
+
+    return
+
+
+# Command-line entry point: every option below maps one-to-one onto a keyword
+# argument of run_pplm_example.
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--pretrained_model",
+        "-M",
+        type=str,
+        default="gpt2-medium",
+        help="pretrained model name or path to local checkpoint",
+    )
+    parser.add_argument(
+        "--cond_text", type=str, default="The lake",
+        help="Prefix texts to condition on"
+    )
+    parser.add_argument(
+        "--uncond", action="store_true",
+        help="Generate from end-of-text as prefix"
+    )
+    parser.add_argument(
+        "--num_samples",
+        type=int,
+        default=1,
+        help="Number of samples to generate from the modified latents",
+    )
+    parser.add_argument(
+        "--bag_of_words",
+        "-B",
+        type=str,
+        default=None,
+        help="Bags of words used for PPLM-BoW. "
+             "Either a BOW id (see list in code) or a filepath. "
+             "Multiple BoWs separated by ;",
+    )
+    parser.add_argument(
+        "--discrim",
+        "-D",
+        type=str,
+        default=None,
+        choices=("clickbait", "sentiment", "toxicity", "generic"),
+        help="Discriminator to use",
+    )
+    parser.add_argument('--discrim_weights', type=str, default=None,
+                        help='Weights for the generic discriminator')
+    parser.add_argument('--discrim_meta', type=str, default=None,
+                        help='Meta information for the generic discriminator')
+    parser.add_argument(
+        "--class_label",
+        type=int,
+        default=-1,
+        help="Class label used for the discriminator",
+    )
+    parser.add_argument("--length", type=int, default=100)
+    parser.add_argument("--stepsize", type=float, default=0.02)
+    parser.add_argument("--temperature", type=float, default=1.0)
+    parser.add_argument("--top_k", type=int, default=10)
+    # The help text used to repeat the --uncond description; this flag
+    # switches the next-token choice from greedy to sampling.
+    parser.add_argument(
+        "--sample", action="store_true",
+        help="Sample the next token instead of picking the most probable one"
+    )
+    parser.add_argument("--num_iterations", type=int, default=3)
+    parser.add_argument("--grad_length", type=int, default=10000)
+    parser.add_argument(
+        "--window_length",
+        type=int,
+        default=0,
+        help="Length of past which is being optimized; "
+             "0 corresponds to infinite window length",
+    )
+    parser.add_argument(
+        "--horizon_length",
+        type=int,
+        default=1,
+        help="Length of future to optimize over",
+    )
+    parser.add_argument("--decay", action="store_true",
+                        help="whether to decay or not")
+    parser.add_argument("--gamma", type=float, default=1.5)
+    parser.add_argument("--gm_scale", type=float, default=0.9)
+    parser.add_argument("--kl_scale", type=float, default=0.01)
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--no_cuda", action="store_true", help="no cuda")
+    parser.add_argument("--colorama", action="store_true",
+                        help="colors keywords")
+
+    args = parser.parse_args()
+    run_pplm_example(**vars(args))
--- a/examples/pplm/run_pplm_discrim_train.py
+++ b/examples/pplm/run_pplm_discrim_train.py
+#! /usr/bin/env python3
+# coding=utf-8
+
+#Copyright (c) 2019 Uber Technologies, Inc.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import argparse
+import csv
+import json
+import math
+import time
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.optim
+import torch.optim as optim
+import torch.utils.data as data
+from nltk.tokenize.treebank import TreebankWordDetokenizer
+from torchtext import data as torchtext_data
+from torchtext import datasets
+from tqdm import tqdm, trange
+
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
+from pplm_classification_head import ClassificationHead
+
+# Seed the torch and numpy random generators when the module is loaded.
+torch.manual_seed(0)
+np.random.seed(0)
+# Added to the denominator in Discriminator.avg_representation so that the
+# division never hits zero.
+EPSILON = 1e-10
+# Sample sentence. NOTE(review): presumably used for a demo prediction during
+# training - confirm against train_discriminator.
+example_sentence = "This is incredible! I love it, this is the best chicken I have ever had."
+# Token-count bound used by train_discriminator: sequences with at least this
+# many tokens are skipped.
+max_length_seq = 100
+
+
+
+
+class Discriminator(torch.nn.Module):
+    """GPT-2 encoder whose averaged hidden states feed a classification head.
+
+    In ``cached_mode`` the input is taken to be the already averaged
+    representation and only the head is applied.
+    """
+
+    def __init__(
+            self,
+            class_size,
+            pretrained_model="gpt2-medium",
+            cached_mode=False,
+            device='cpu'
+    ):
+        super().__init__()
+        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
+        self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
+        hidden_size = self.encoder.transformer.config.hidden_size
+        self.embed_size = hidden_size
+        self.classifier_head = ClassificationHead(
+            class_size=class_size,
+            embed_size=hidden_size
+        )
+        self.cached_mode = cached_mode
+        self.device = device
+
+    def get_classifier(self):
+        """Return the classification head."""
+        return self.classifier_head
+
+    def train_custom(self):
+        """Freeze every encoder parameter and put the head in training mode."""
+        for encoder_param in self.encoder.parameters():
+            encoder_param.requires_grad = False
+        self.classifier_head.train()
+
+    def avg_representation(self, x):
+        """Average the encoder's hidden states over the non-zero ids of ``x``."""
+        # 1.0 wherever the id is non-zero, repeated across the embedding axis
+        keep = x.ne(0).unsqueeze(2).repeat(1, 1, self.embed_size)
+        keep = keep.float().to(self.device).detach()
+
+        hidden_states, _ = self.encoder.transformer(x)
+
+        summed = torch.sum(hidden_states * keep, dim=1)
+        # EPSILON keeps the division defined when a row has no kept ids
+        counts = torch.sum(keep, dim=1).detach() + EPSILON
+        return summed / counts
+
+    def forward(self, x):
+        """Return the log-softmax of the head's logits for ``x``."""
+        features = x.to(self.device)
+        if not self.cached_mode:
+            features = self.avg_representation(features)
+
+        head_logits = self.classifier_head(features)
+        return F.log_softmax(head_logits, dim=-1)
+
+
+class Dataset(data.Dataset):
+    """In-memory dataset pairing inputs ``X`` with labels ``y`` by index."""
+
+    def __init__(self, X, y):
+        """Store the input collection ``X`` and the label collection ``y``."""
+        self.X = X
+        self.y = y
+
+    def __len__(self):
+        """Return the number of inputs."""
+        return len(self.X)
+
+    def __getitem__(self, index):
+        """Return ``{"X": input, "y": label}`` for the given index."""
+        return {"X": self.X[index], "y": self.y[index]}
+
+
+def collate_fn(data):
+    """Collate ``{"X": sequence, "y": label}`` items into padded batch tensors.
+
+    Returns:
+        ``(x_batch, y_batch)``: a long tensor of shape (batch, longest
+        sequence) with shorter sequences right-padded with 0, and a long
+        tensor of labels.
+    """
+    # Regroup the per-item dicts into one list per key.
+    columns = {
+        key: [item[key] for item in data]
+        for key in data[0].keys()
+    }
+
+    sequences = columns["X"]
+    seq_lengths = [len(seq) for seq in sequences]
+
+    # zeros everywhere, so untouched positions carry padding value = 0
+    x_batch = torch.zeros(len(sequences), max(seq_lengths)).long()
+    for row, (seq, seq_len) in enumerate(zip(sequences, seq_lengths)):
+        x_batch[row, :seq_len] = seq[:seq_len]
+
+    y_batch = torch.tensor(columns["y"], dtype=torch.long)
+
+    return x_batch, y_batch
+
+
+def cached_collate_fn(data):
+    """Collate items whose ``"X"`` is a cached representation tensor.
+
+    The ``"X"`` tensors are concatenated along dim 0 and the ``"y"`` labels
+    are packed into a long tensor.
+    """
+    columns = {
+        key: [item[key] for item in data]
+        for key in data[0].keys()
+    }
+
+    stacked_reps = torch.cat(columns["X"], 0)
+    labels = torch.tensor(columns["y"], dtype=torch.long)
+
+    return stacked_reps, labels
+
+
+def train_epoch(data_loader, discriminator, optimizer,
+                epoch=0, log_interval=10, device='cpu'):
+    """Run one optimisation pass of ``discriminator`` over ``data_loader``.
+
+    Args:
+        data_loader: iterable of ``(input, target)`` batches exposing a
+            ``dataset`` attribute (its length is used in the progress line).
+        discriminator: model providing ``train_custom()`` and returning
+            log-probabilities when called on a batch.
+        optimizer: optimizer stepped once per batch.
+        epoch: zero-based epoch index, printed as ``epoch + 1``.
+        log_interval: a progress line is printed every this many batches.
+        device: device the batches are moved to.
+    """
+    samples_so_far = 0
+    discriminator.train_custom()
+    for batch_idx, (input_t, target_t) in enumerate(data_loader):
+        input_t, target_t = input_t.to(device), target_t.to(device)
+
+        optimizer.zero_grad()
+
+        output_t = discriminator(input_t)
+        loss = F.nll_loss(output_t, target_t)
+        # Each batch builds a fresh graph that is back-propagated exactly
+        # once, so it is not retained (retaining only held on to memory).
+        loss.backward()
+        optimizer.step()
+
+        samples_so_far += len(input_t)
+
+        if batch_idx % log_interval == 0:
+            print(
+                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
+                    epoch + 1,
+                    samples_so_far, len(data_loader.dataset),
+                    100 * samples_so_far / len(data_loader.dataset), loss.item()
+                )
+            )
+
+
+def evaluate_performance(data_loader, discriminator, device='cpu'):
+    """Print the average NLL loss and the accuracy over ``data_loader``.
+
+    The discriminator is switched to eval mode and run without gradient
+    tracking; both figures are normalised by ``len(data_loader.dataset)``.
+    """
+    discriminator.eval()
+    total_loss = 0
+    num_correct = 0
+    with torch.no_grad():
+        for inputs, targets in data_loader:
+            inputs = inputs.to(device)
+            targets = targets.to(device)
+            log_probs = discriminator(inputs)
+            # accumulate the summed (not averaged) loss of the batch
+            total_loss += F.nll_loss(
+                log_probs, targets, reduction="sum"
+            ).item()
+            # the predicted class is the index of the largest log-probability
+            predictions = log_probs.argmax(dim=1)
+            num_correct += (predictions == targets).sum().item()
+
+    num_examples = len(data_loader.dataset)
+    mean_loss = total_loss / num_examples
+    accuracy_pct = 100. * num_correct / num_examples
+
+    print(
+        "Performance on test set: "
+        "Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)".format(
+            mean_loss, num_correct, num_examples, accuracy_pct
+        )
+    )
+
+
+def predict(input_sentence, model, classes, cached=False, device='cpu'):
+    """Print the per-class probabilities ``model`` assigns to a sentence.
+
+    The sentence is encoded with ``model.tokenizer``; with ``cached`` set it is
+    first reduced with ``model.avg_representation`` before being passed to the
+    model. The model output is read as log-probabilities, one per class.
+    """
+    token_ids = model.tokenizer.encode(input_sentence)
+    model_input = torch.tensor([token_ids], dtype=torch.long, device=device)
+    if cached:
+        model_input = model.avg_representation(model_input)
+
+    model_output = model(model_input)
+    log_probs = model_output.data.cpu().numpy().flatten().tolist()
+
+    print("Input sentence:", input_sentence)
+    class_scores = [
+        "{}: {:.4f}".format(class_name, math.exp(log_prob))
+        for class_name, log_prob in zip(classes, log_probs)
+    ]
+    print("Predictions:", ", ".join(class_scores))
+
+
+def get_cached_data_loader(dataset, batch_size, discriminator,
+                           shuffle=False, device='cpu'):
+    """Precompute averaged representations and wrap them in a DataLoader.
+
+    ``dataset`` is read once in order; each batch is passed through
+    ``discriminator.avg_representation`` without gradient tracking. The
+    returned loader serves the cached representations with their labels and
+    honours ``shuffle``.
+    """
+    raw_loader = torch.utils.data.DataLoader(dataset=dataset,
+                                             batch_size=batch_size,
+                                             collate_fn=collate_fn)
+
+    cached_reps = []
+    cached_labels = []
+    for batch_x, batch_y in tqdm(raw_loader, ascii=True):
+        with torch.no_grad():
+            batch_x = batch_x.to(device)
+            reps = discriminator.avg_representation(batch_x).cpu().detach()
+            # one entry per example, each keeping a leading dim of size 1
+            cached_reps.extend(torch.unbind(reps.unsqueeze(1)))
+            cached_labels.extend(batch_y.cpu().numpy().tolist())
+
+    return torch.utils.data.DataLoader(
+        dataset=Dataset(cached_reps, cached_labels),
+        batch_size=batch_size,
+        shuffle=shuffle,
+        collate_fn=cached_collate_fn)
+
+
+def train_discriminator(
+        dataset, dataset_fp=None, pretrained_model="gpt2-medium",
+        epochs=10, batch_size=64, log_interval=10,
+        save_model=False, cached=False, no_cuda=False):
+    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
+
+    print("Preprocessing {} dataset...".format(dataset))
+    start = time.time()
+
+    if dataset == "SST":
+        idx2class = ["positive", "negative", "very positive", "very negative",
+                     "neutral"]
+        class2idx = {c: i for i, c in enumerate(idx2class)}
+
+        discriminator = Discriminator(
+            class_size=len(idx2class),
+            pretrained_model=pretrained_model,
+            cached_mode=cached,
+            device=device
+        ).to(device)
+
+        text = torchtext_data.Field()
+        label = torchtext_data.Field(sequential=False)
+        train_data, val_data, test_data = datasets.SST.splits(
+            text,
+            label,
+            fine_grained=True,
+            train_subtrees=True,
+        )
+
+        x = []
+        y = []
+        for i in trange(len(train_data), ascii=True):
+            seq = TreebankWordDetokenizer().detokenize(
+                vars(train_data[i])["text"]
+            )
+            seq = discriminator.tokenizer.encode(seq)
+            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
+            x.append(seq)
+            y.append(class2idx[vars(train_data[i])["label"]])
+        train_dataset = Dataset(x, y)
+
+        test_x = []
+        test_y = []
+        for i in trange(len(test_data), ascii=True):
+            seq = TreebankWordDetokenizer().detokenize(
+                vars(test_data[i])["text"]
+            )
+            seq = discriminator.tokenizer.encode(seq)
+            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
+            test_x.append(seq)
+            test_y.append(class2idx[vars(test_data[i])["label"]])
+        test_dataset = Dataset(test_x, test_y)
+
+        discriminator_meta = {
+            "class_size": len(idx2class),
+            "embed_size": discriminator.embed_size,
+            "pretrained_model": pretrained_model,
+            "class_vocab": class2idx,
+            "default_class": 2,
+        }
+
+    elif dataset == "clickbait":
+        idx2class = ["non_clickbait", "clickbait"]
+        class2idx = {c: i for i, c in enumerate(idx2class)}
+
+        discriminator = Discriminator(
+            class_size=len(idx2class),
+            pretrained_model=pretrained_model,
+            cached_mode=cached,
+            device=device
+        ).to(device)
+
+        with open("datasets/clickbait/clickbait_train_prefix.txt") as f:
+            data = []
+            for i, line in enumerate(f):
+                try:
+                    data.append(eval(line))
+                except:
+                    print("Error evaluating line {}: {}".format(
+                        i, line
+                    ))
+                    continue
+        x = []
+        y = []
+        with open("datasets/clickbait/clickbait_train_prefix.txt") as f:
+            for i, line in enumerate(tqdm(f, ascii=True)):
+                try:
+                    d = eval(line)
+                    seq = discriminator.tokenizer.encode(d["text"])
+
+                    if len(seq) < max_length_seq:
+                        seq = torch.tensor(
+                            [50256] + seq, device=device, dtype=torch.long
+                        )
+                    else:
+                        print("Line {} is longer than maximum length {}".format(
+                            i, max_length_seq
+                        ))
+                        continue
+                    x.append(seq)
+                    y.append(d["label"])
+                except:
+                    print("Error evaluating / tokenizing"
+                          " line {}, skipping it".format(i))
+                    pass
+
+        full_dataset = Dataset(x, y)
+        train_size = int(0.9 * len(full_dataset))
+        test_size = len(full_dataset) - train_size
+        train_dataset, test_dataset = torch.utils.data.random_split(
+            full_dataset, [train_size, test_size]
+        )
+
+        discriminator_meta = {
+            "class_size": len(idx2class),
+            "embed_size": discriminator.embed_size,
+            "pretrained_model": pretrained_model,
+            "class_vocab": class2idx,
+            "default_class": 1,
+        }
+
+    elif dataset == "toxic":
+        idx2class = ["non_toxic", "toxic"]
+        class2idx = {c: i for i, c in enumerate(idx2class)}
+
+        discriminator = Discriminator(
+            class_size=len(idx2class),
+            pretrained_model=pretrained_model,
+            cached_mode=cached,
+            device=device
+        ).to(device)
+
+        x = []
+        y = []
+        with open("datasets/toxic/toxic_train.txt") as f:
+            for i, line in enumerate(tqdm(f, ascii=True)):
+                try:
+                    d = eval(line)
+                    seq = discriminator.tokenizer.encode(d["text"])
+
+                    if len(seq) < max_length_seq:
+                        seq = torch.tensor(
+                            [50256] + seq, device=device, dtype=torch.long
+                        )
+                    else:
+                        print("Line {} is longer than maximum length {}".format(
+                            i, max_length_seq
+                        ))
+                        continue
+                    x.append(seq)
+                    y.append(int(np.sum(d["label"]) > 0))
+                except:
+                    print("Error evaluating / tokenizing"
+                          " line {}, skipping it".format(i))
+                    pass
+
+        full_dataset = Dataset(x, y)
+        train_size = int(0.9 * len(full_dataset))
+        test_size = len(full_dataset) - train_size
+        train_dataset, test_dataset = torch.utils.data.random_split(
+            full_dataset, [train_size, test_size]
+        )
+
+        discriminator_meta = {
+            "class_size": len(idx2class),
+            "embed_size": discriminator.embed_size,
+            "pretrained_model": pretrained_model,
+            "class_vocab": class2idx,
+            "default_class": 0,
+        }
+
+    else:  # if dataset == "generic":
+        # This assumes the input dataset is a TSV with the following structure:
+        # class \t text
+
+        if dataset_fp is None:
+            raise ValueError("When generic dataset is selected, "
+                             "dataset_fp needs to be specified aswell.")
+
+        classes = set()
+        with open(dataset_fp) as f:
+            csv_reader = csv.reader(f, delimiter="\t")
+            for row in tqdm(csv_reader, ascii=True):
+                if row:
+                    classes.add(row[0])
+
+        idx2class = sorted(classes)
+        class2idx = {c: i for i, c in enumerate(idx2class)}
+
+        discriminator = Discriminator(
+            class_size=len(idx2class),
+            pretrained_model=pretrained_model,
+            cached_mode=cached,
+            device=device
+        ).to(device)
+
+        x = []
+        y = []
+        with open(dataset_fp) as f:
+            csv_reader = csv.reader(f, delimiter="\t")
+            for i, row in enumerate(tqdm(csv_reader, ascii=True)):
+                if row:
+                    label = row[0]
+                    text = row[1]
+
+                    try:
+                        seq = discriminator.tokenizer.encode(text)
+                        if (len(seq) < max_length_seq):
+                            seq = torch.tensor(
+                                [50256] + seq,
+                                device=device,
+                                dtype=torch.long
+                            )
+
+                        else:
+                            print(
+                                "Line {} is longer than maximum length {}".format(
+                                    i, max_length_seq
+                                ))
+                            continue
+
+                        x.append(seq)
+                        y.append(class2idx[label])
+
+                    except:
+                        print("Error tokenizing line {}, skipping it".format(i))
+                        pass
+
+        full_dataset = Dataset(x, y)
+        train_size = int(0.9 * len(full_dataset))
+        test_size = len(full_dataset) - train_size
+        train_dataset, test_dataset = torch.utils.data.random_split(
+            full_dataset,
+            [train_size, test_size]
+        )
+
+        discriminator_meta = {
+            "class_size": len(idx2class),
+            "embed_size": discriminator.embed_size,
+            "pretrained_model": pretrained_model,
+            "class_vocab": class2idx,
+            "default_class": 0,
+        }
+
+    end = time.time()
+    print("Preprocessed {} data points".format(
+        len(train_dataset) + len(test_dataset))
+    )
+    print("Data preprocessing took: {:.3f}s".format(end - start))
+
+    if cached:
+        print("Building representation cache...")
+
+        start = time.time()
+
+        train_loader = get_cached_data_loader(
+            train_dataset, batch_size, discriminator,
+            shuffle=True, device=device
+        )
+
+        test_loader = get_cached_data_loader(
+            test_dataset, batch_size, discriminator, device=device
+        )
+
+        end = time.time()
+        print("Building representation cache took: {:.3f}s".format(end - start))
+
+    else:
+        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                                   batch_size=batch_size,
+                                                   shuffle=True,
+                                                   collate_fn=collate_fn)
+        test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
+                                                  batch_size=batch_size,
+                                                  collate_fn=collate_fn)
+
+    if save_model:
+        with open("{}_classifier_head_meta.json".format(dataset),
+                  "w") as meta_file:
+            json.dump(discriminator_meta, meta_file)
+
+    optimizer = optim.Adam(discriminator.parameters(), lr=0.0001)
+
+    for epoch in range(epochs):
+        start = time.time()
+        print("\nEpoch", epoch + 1)
+
+        train_epoch(
+            discriminator=discriminator,
+            data_loader=train_loader,
+            optimizer=optimizer,
+            epoch=epoch,
+            log_interval=log_interval,
+            device=device
+        )
+        evaluate_performance(
+            data_loader=test_loader,
+            discriminator=discriminator,
+            device=device
+        )
+
+        end = time.time()
+        print("Epoch took: {:.3f}s".format(end - start))
+
+        print("\nExample prediction")
+        predict(example_sentence, discriminator, idx2class,
+                cached=cached, device=device)
+
+        if save_model:
+            # torch.save(discriminator.state_dict(),
+            #           "{}_discriminator_{}.pt".format(
+            #               args.dataset, epoch + 1
+            #               ))
+            torch.save(discriminator.get_classifier().state_dict(),
+                       "{}_classifier_head_epoch_{}.pt".format(dataset,
+                                                               epoch + 1))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Train a discriminator on top of GPT-2 representations")
+    parser.add_argument("--dataset", type=str, default="SST",
+                        choices=("SST", "clickbait", "toxic", "generic"),
+                        help="dataset to train the discriminator on. "
+                             "In case of generic, the dataset is expected "
+                             "to be a TSV file with structure: class \\t text")
+    parser.add_argument("--dataset_fp", type=str, default="",
+                        help="File path of the dataset to use. "
+                             "Needed only in case of generic dataset")
+    parser.add_argument("--pretrained_model", type=str, default="gpt2-medium",
+                        help="Pretrained model to use as encoder")
+    parser.add_argument("--epochs", type=int, default=10, metavar="N",
+                        help="Number of training epochs")
+    parser.add_argument("--batch_size", type=int, default=64, metavar="N",
+                        help="input batch size for training (default: 64)")
+    parser.add_argument("--log_interval", type=int, default=10, metavar="N",
+                        help="how many batches to wait before logging training status")
+    parser.add_argument("--save_model", action="store_true",
+                        help="whether to save the model")
+    parser.add_argument("--cached", action="store_true",
+                        help="whether to cache the input representations")
+    parser.add_argument("--no_cuda", action="store_true",
+                        help="use to turn off cuda")
+    args = parser.parse_args()
+    # Every parsed option is forwarded as a same-named keyword argument.
+    train_discriminator(**(vars(args)))
--- a/examples/run_bertology.py
+++ b/examples/run_bertology.py
@@ -39,8 +39,9 @@ from transformers import (WEIGHTS_NAME,

 from run_glue import set_seed, load_and_cache_examples, ALL_MODELS, MODEL_CLASSES

-from utils_glue import (compute_metrics, convert_examples_to_features,
-                        output_modes, processors)
+from transformers import glue_compute_metrics as compute_metrics
+from transformers import glue_output_modes as output_modes
+from transformers import glue_processors as processors

 logger = logging.getLogger(__name__)

@@ -233,6 +234,8 @@ def main():
                        help="If > 0: limit the data to a subset of data_subset instances.")
    parser.add_argument("--overwrite_output_dir", action='store_true',
                        help="Whether to overwrite data in output directory")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")

    parser.add_argument("--dont_normalize_importance_by_layer", action='store_true',
                        help="Don't normalize importance score by layers")

--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -22,6 +22,7 @@ import glob
 import logging
 import os
 import random
+import json

 import numpy as np
 import torch
@@ -47,9 +48,13 @@ from transformers import (WEIGHTS_NAME, BertConfig,
                                  XLNetTokenizer,
                                  DistilBertConfig,
                                  DistilBertForSequenceClassification,
-                                  DistilBertTokenizer)
+                                  DistilBertTokenizer,
+                                  AlbertConfig,
+                                  AlbertForSequenceClassification, 
+                                  AlbertTokenizer,
+                                )

-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup

 from transformers import glue_compute_metrics as compute_metrics
 from transformers import glue_output_modes as output_modes
@@ -66,7 +71,8 @@ MODEL_CLASSES = {
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
+    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
+    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer)
 }


@@ -99,8 +105,9 @@ def train(args, train_dataset, model, tokenizer):
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
+
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
@@ -158,7 +165,7 @@ def train(args, train_dataset, model, tokenizer):
                loss.backward()

            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu:
+            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
@@ -170,15 +177,23 @@ def train(args, train_dataset, model, tokenizer):
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Log metrics
+                    logs = {}
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                            eval_key = 'eval_{}'.format(key)
+                            logs[eval_key] = value
+
+                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
+                    learning_rate_scalar = scheduler.get_lr()[0]
+                    logs['learning_rate'] = learning_rate_scalar
+                    logs['loss'] = loss_scalar
                    logging_loss = tr_loss

+                    for key, value in logs.items():
+                        tb_writer.add_scalar(key, value, global_step)
+                    print(json.dumps({**logs, **{'step': global_step}}))
+
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
@@ -189,11 +204,6 @@ def train(args, train_dataset, model, tokenizer):
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

-            if args.tpu:
-                args.xla_model.optimizer_step(optimizer, barrier=True)
-                model.zero_grad()
-                global_step += 1
-
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
@@ -221,9 +231,13 @@ def evaluate(args, model, tokenizer, prefix=""):

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
-        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

+        # multi-gpu eval
+        if args.n_gpu > 1:
+            model = torch.nn.DataParallel(model)
+
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
@@ -318,7 +332,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
-
+ 
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

@@ -362,7 +376,7 @@ def main():
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")     
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
@@ -393,15 +407,6 @@ def main():
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

-    parser.add_argument('--tpu', action='store_true',
-                        help="Whether to run on the TPU defined in the environment variables")
-    parser.add_argument('--tpu_ip_address', type=str, default='',
-                        help="TPU IP address if none are set in the environment variables")
-    parser.add_argument('--tpu_name', type=str, default='',
-                        help="TPU name if none are set in the environment variables")
-    parser.add_argument('--xrt_tpu_config', type=str, default='',
-                        help="XRT TPU config if none are set in the environment variables")
-
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
@@ -435,23 +440,6 @@ def main():
        args.n_gpu = 1
    args.device = device

-    if args.tpu:
-        if args.tpu_ip_address:
-            os.environ["TPU_IP_ADDRESS"] = args.tpu_ip_address
-        if args.tpu_name:
-            os.environ["TPU_NAME"] = args.tpu_name
-        if args.xrt_tpu_config:
-            os.environ["XRT_TPU_CONFIG"] = args.xrt_tpu_config
-
-        assert "TPU_IP_ADDRESS" in os.environ
-        assert "TPU_NAME" in os.environ
-        assert "XRT_TPU_CONFIG" in os.environ
-
-        import torch_xla
-        import torch_xla.core.xla_model as xm
-        args.device = xm.xla_device()
-        args.xla_model = xm
-
    # Setup logging
    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
@@ -505,7 +493,7 @@ def main():


    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0) and not args.tpu:
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -42,12 +42,13 @@ except:

 from tqdm import tqdm, trange

-from transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
+from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                                  BertConfig, BertForMaskedLM, BertTokenizer,
                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
-                                  DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
+                                  DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
+                                  CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)


 logger = logging.getLogger(__name__)
@@ -58,17 +59,18 @@ MODEL_CLASSES = {
    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
+    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
+    'camembert': (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)
 }


 class TextDataset(Dataset):
-    def __init__(self, tokenizer, file_path='train', block_size=512):
+    def __init__(self, tokenizer, args, file_path='train', block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, 'cached_lm_' + str(block_size) + '_' + filename)
+        cached_features_file = os.path.join(directory, args.model_name_or_path + '_cached_lm_' + str(block_size) + '_' + filename)

-        if os.path.exists(cached_features_file):
+        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, 'rb') as handle:
                self.examples = pickle.load(handle)
@@ -99,7 +101,7 @@ class TextDataset(Dataset):


 def load_and_cache_examples(args, tokenizer, evaluate=False):
-    dataset = TextDataset(tokenizer, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size)
+    dataset = TextDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size)
    return dataset


@@ -185,7 +187,14 @@ def train(args, train_dataset, model, tokenizer):
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+
+    # Check if saved optimizer or scheduler states exist
+    if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
+        # Load in optimizer and scheduler states
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
+
    if args.fp16:
        try:
            from apex import amp
@@ -214,13 +223,37 @@ def train(args, train_dataset, model, tokenizer):
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
+    epochs_trained = 0
+    steps_trained_in_current_epoch = 0
+    # Check if continuing training from a checkpoint
+    if os.path.exists(args.model_name_or_path):
+        # set global_step to global_step of last saved checkpoint from model path
+        global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
+        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
+        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
+
+        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
+        logger.info("  Continuing training from epoch %d", epochs_trained)
+        logger.info("  Continuing training from global step %d", global_step)
+        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
+
    tr_loss, logging_loss = 0.0, 0.0
+
+    model_to_resize = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+    model_to_resize.resize_token_embeddings(len(tokenizer))
+
    model.zero_grad()
-    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
+            
+            # Skip past any already trained steps if resuming training
+            if steps_trained_in_current_epoch > 0:
+                steps_trained_in_current_epoch -= 1
+                continue
+
            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
@@ -268,11 +301,17 @@ def train(args, train_dataset, model, tokenizer):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
+                    tokenizer.save_pretrained(output_dir)
+
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
+                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
+
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
@@ -297,9 +336,13 @@ def evaluate(args, model, tokenizer, prefix=""):

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

+    # multi-gpu evaluate
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
@@ -427,7 +470,7 @@ def main():
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    args = parser.parse_args()

-    if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm:
+    if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
                         "flag (masked language modeling).")
    if args.eval_data_file is None and args.do_eval:

--- a/examples/run_multiple_choice.py
+++ b/examples/run_multiple_choice.py
@@ -43,7 +43,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
                                  XLNetTokenizer, RobertaConfig,
                                  RobertaForMultipleChoice, RobertaTokenizer)

-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup

 from utils_multiple_choice import (convert_examples_to_features, processors)

@@ -101,7 +101,7 @@ def train(args, train_dataset, model, tokenizer):
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
@@ -226,9 +226,13 @@ def evaluate(args, model, tokenizer, prefix="", test=False):

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
-        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

+        # multi-gpu evaluate
+        if args.n_gpu > 1:
+            model = torch.nn.DataParallel(model)
+
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))

--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -33,19 +33,23 @@ from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file

-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup
 from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
 from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
+from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer
+from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer

 logger = logging.getLogger(__name__)

 ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig)),
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
    ())

 MODEL_CLASSES = {
    "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
-    "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer)
+    "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
+    "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
 }


@@ -80,7 +84,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
@@ -121,9 +125,10 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {"input_ids": batch[0],
                      "attention_mask": batch[1],
-                      "token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
-                      # XLM and RoBERTa don"t use segment_ids
                      "labels": batch[3]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None  # XLM and RoBERTa don"t use segment_ids
+
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

@@ -191,6 +196,10 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

+    # multi-gpu evaluate
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_dataset))
@@ -206,9 +215,9 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
        with torch.no_grad():
            inputs = {"input_ids": batch[0],
                      "attention_mask": batch[1],
-                      "token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
-                      # XLM and RoBERTa don"t use segment_ids
                      "labels": batch[3]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None  # XLM and RoBERTa don"t use segment_ids
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

@@ -520,3 +529,4 @@ def main():

 if __name__ == "__main__":
    main()
+
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -16,6 +16,8 @@
 """ Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""

 from __future__ import absolute_import, division, print_function
+from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult
+from transformers.data.metrics.squad_metrics import compute_predictions_logits, compute_predictions_log_probs, squad_evaluate

 import argparse
 import logging
@@ -23,11 +25,9 @@ import os
 import random
 import glob
 import timeit
-
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
 from torch.utils.data.distributed import DistributedSampler

 try:
@@ -43,18 +43,12 @@ from transformers import (WEIGHTS_NAME, BertConfig,
                                  XLMTokenizer, XLNetConfig,
                                  XLNetForQuestionAnswering,
                                  XLNetTokenizer,
-                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
-
-from transformers import AdamW, WarmupLinearSchedule
+                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer,
+                                  AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer,
+                                  XLMConfig, XLMForQuestionAnswering, XLMTokenizer,
+                                  )

-from utils_squad import (read_squad_examples, convert_examples_to_features,
-                         RawResult, write_predictions,
-                         RawResultExtended, write_predictions_extended)
-
-# The follwing import is the official SQuAD evaluation script (2.0).
-# You can remove it from the dependencies if you are using this script outside of the library
-# We've added it here for automated tests (see examples/test_examples.py file)
-from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
+from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features

 logger = logging.getLogger(__name__)

@@ -65,7 +59,9 @@ MODEL_CLASSES = {
    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
+    'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
+    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer)
 }

 def set_seed(args):
@@ -98,14 +94,16 @@ def train(args, train_dataset, model, tokenizer):
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
@@ -128,25 +126,31 @@ def train(args, train_dataset, model, tokenizer):
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

-    global_step = 0
+    global_step = 1
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':       batch[0],
-                      'attention_mask':  batch[1],
-                      'start_positions': batch[3],
-                      'end_positions':   batch[4]}
+
+            inputs = {
+                'input_ids':       batch[0],
+                'attention_mask':  batch[1],
+                'start_positions': batch[3],
+                'end_positions':   batch[4]
+            }
+
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
+
            if args.model_type in ['xlnet', 'xlm']:
-                inputs.update({'cls_index': batch[5],
-                               'p_mask':       batch[6]})
+                inputs.update({'cls_index': batch[5], 'p_mask': batch[6]})
+
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

@@ -173,8 +177,8 @@ def train(args, train_dataset, model, tokenizer):
                model.zero_grad()
                global_step += 1

+                # Log metrics
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
@@ -183,8 +187,8 @@ def train(args, train_dataset, model, tokenizer):
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                    logging_loss = tr_loss

+                # Save model checkpoint
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
@@ -213,46 +217,72 @@ def evaluate(args, model, tokenizer, prefix=""):
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+
    # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
+    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

+    # multi-gpu evaluate
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
+
    all_results = []
    start_time = timeit.default_timer()
+
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
+
        with torch.no_grad():
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1]
-                      }
+            inputs = {
+                'input_ids':      batch[0],
+                'attention_mask': batch[1]
+            }
+            
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+
            example_indices = batch[3]
+            
+            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ['xlnet', 'xlm']:
-                inputs.update({'cls_index': batch[4],
-                               'p_mask':    batch[5]})
+                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
+
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
-            if args.model_type in ['xlnet', 'xlm']:
-                # XLNet uses a more complex post-processing procedure
-                result = RawResultExtended(unique_id            = unique_id,
-                                           start_top_log_probs  = to_list(outputs[0][i]),
-                                           start_top_index      = to_list(outputs[1][i]),
-                                           end_top_log_probs    = to_list(outputs[2][i]),
-                                           end_top_index        = to_list(outputs[3][i]),
-                                           cls_logits           = to_list(outputs[4][i]))
+
+            output = [to_list(output[i]) for output in outputs]
+
+            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
+            # models only use two.
+            if len(output) >= 5:
+                start_logits = output[0]
+                start_top_index = output[1]
+                end_logits = output[2]
+                end_top_index = output[3]
+                cls_logits = output[4]
+
+                result = SquadResult(
+                    unique_id, start_logits, end_logits, 
+                    start_top_index=start_top_index, 
+                    end_top_index=end_top_index, 
+                    cls_logits=cls_logits
+                )
+
            else:
-                result = RawResult(unique_id    = unique_id,
-                                   start_logits = to_list(outputs[0][i]),
-                                   end_logits   = to_list(outputs[1][i]))
+                start_logits, end_logits = output
+                result = SquadResult(
+                    unique_id, start_logits, end_logits
+                )
+
            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
@@ -261,84 +291,81 @@ def evaluate(args, model, tokenizer, prefix=""):
    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
+
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

+    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ['xlnet', 'xlm']:
-        # XLNet uses a more complex post-processing procedure
-        write_predictions_extended(examples, features, all_results, args.n_best_size,
+        predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size,
                        args.max_answer_length, output_prediction_file,
-                        output_nbest_file, output_null_log_odds_file, args.predict_file,
+                        output_nbest_file, output_null_log_odds_file,
                        model.config.start_n_top, model.config.end_n_top,
                        args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
-        write_predictions(examples, features, all_results, args.n_best_size,
+        predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size,
                        args.max_answer_length, args.do_lower_case, output_prediction_file,
                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                        args.version_2_with_negative, args.null_score_diff_threshold)

-    # Evaluate with the official SQuAD script
-    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
-                                 pred_file=output_prediction_file,
-                                 na_prob_file=output_null_log_odds_file)
-    results = evaluate_on_squad(evaluate_options)
+    # Compute the F1 and exact scores.
+    results = squad_evaluate(examples, predictions)
    return results

-
 def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
-    input_file = args.predict_file if evaluate else args.train_file
-    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
+    input_dir = args.data_dir if args.data_dir else "."
+    cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format(
        'dev' if evaluate else 'train',
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length)))
+        str(args.max_seq_length))
+    )
+
+    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file)
+        features_and_dataset = torch.load(cached_features_file)
+        features, dataset = features_and_dataset["features"], features_and_dataset["dataset"]
    else:
-        logger.info("Creating features from dataset file at %s", input_file)
-        examples = read_squad_examples(input_file=input_file,
-                                                is_training=not evaluate,
-                                                version_2_with_negative=args.version_2_with_negative)
-        features = convert_examples_to_features(examples=examples,
-                                                tokenizer=tokenizer,
-                                                max_seq_length=args.max_seq_length,
-                                                doc_stride=args.doc_stride,
-                                                max_query_length=args.max_query_length,
-                                                is_training=not evaluate,
-                                                cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
-                                                pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
-                                                cls_token_at_end=True if args.model_type in ['xlnet'] else False,
-                                                sequence_a_is_doc=True if args.model_type in ['xlnet'] else False)
+        logger.info("Creating features from dataset file at %s", input_dir)
+
+        if not args.data_dir:
+            try:
+                import tensorflow_datasets as tfds
+            except ImportError:
+                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")
+
+            if args.version_2_with_negative:
+                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")
+
+            tfds_examples = tfds.load("squad")
+            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
+        else:
+            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
+            examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+
+        features, dataset = squad_convert_examples_to_features( 
+            examples=examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=not evaluate,
+            return_dataset='pt'
+        )
+
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save(features, cached_features_file)
+            torch.save({"features": features, "dataset": dataset}, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

-    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
-    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
-    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
-    if evaluate:
-        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_example_index, all_cls_index, all_p_mask)
-    else:
-        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
-        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_start_positions, all_end_positions,
-                                all_cls_index, all_p_mask)
-
    if output_examples:
        return dataset, examples, features
    return dataset
@@ -348,10 +375,6 @@ def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
-    parser.add_argument("--train_file", default=None, type=str, required=True,
-                        help="SQuAD json for training. E.g., train-v1.1.json")
-    parser.add_argument("--predict_file", default=None, type=str, required=True,
-                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
@@ -360,6 +383,8 @@ def main():
                        help="The output directory where the model checkpoints and predictions will be written.")

    ## Other parameters
+    parser.add_argument("--data_dir", default=None, type=str,
+                        help="The input data dir. Should contain the .json files for the task. If not specified, will run with tensorflow_datasets.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
@@ -398,7 +423,7 @@ def main():
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
+                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
@@ -444,6 +469,11 @@ def main():
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

+    args.predict_file = os.path.join(args.output_dir, 'predictions_{}_{}.txt'.format(
+        list(filter(None, args.model_name_or_path.split('/'))).pop(),
+        str(args.max_seq_length))
+    )
+
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

@@ -533,7 +563,7 @@ def main():
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
+        model = model_class.from_pretrained(args.output_dir, force_download=True)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

@@ -551,7 +581,7 @@ def main():
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            model = model_class.from_pretrained(checkpoint)
+            model = model_class.from_pretrained(checkpoint, force_download=True)
            model.to(args.device)

            # Evaluate

--- a/examples/run_summarization_finetuning.py
+++ b/examples/run_summarization_finetuning.py
-# coding=utf-8
-# Copyright 2019 The HuggingFace Inc. team.
-# Copyright (c) 2019 The HuggingFace Inc.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning seq2seq models for sequence generation."""
-
-import argparse
-import functools
-import logging
-import os
-import random
-import sys
-
-import numpy as np
-from tqdm import tqdm, trange
-import torch
-from torch.optim import Adam
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
-
-from transformers import (
-    AutoTokenizer,
-    BertForMaskedLM,
-    BertConfig,
-    PreTrainedEncoderDecoder,
-    Model2Model,
-)
-
-from utils_summarization import (
-    CNNDailyMailDataset,
-    encode_for_summarization,
-    fit_to_block_size,
-    build_lm_labels,
-    build_mask,
-    compute_token_type_ids,
-)
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(stream=sys.stdout, level=logging.INFO)
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-
-
-# ------------
-# Load dataset
-# ------------
-
-
-def load_and_cache_examples(args, tokenizer):
-    dataset = CNNDailyMailDataset(tokenizer, data_dir=args.data_dir)
-    return dataset
-
-
-def collate(data, tokenizer, block_size):
-    """ List of tuple as an input. """
-    # remove the files with empty an story/summary, encode and fit to block
-    data = filter(lambda x: not (len(x[0]) == 0 or len(x[1]) == 0), data)
-    data = [
-        encode_for_summarization(story, summary, tokenizer) for story, summary in data
-    ]
-    data = [
-        (
-            fit_to_block_size(story, block_size, tokenizer.pad_token_id),
-            fit_to_block_size(summary, block_size, tokenizer.pad_token_id),
-        )
-        for story, summary in data
-    ]
-
-    stories = torch.tensor([story for story, summary in data])
-    summaries = torch.tensor([summary for story, summary in data])
-    encoder_token_type_ids = compute_token_type_ids(stories, tokenizer.cls_token_id)
-    encoder_mask = build_mask(stories, tokenizer.pad_token_id)
-    decoder_mask = build_mask(summaries, tokenizer.pad_token_id)
-    lm_labels = build_lm_labels(summaries, tokenizer.pad_token_id)
-
-    return (
-        stories,
-        summaries,
-        encoder_token_type_ids,
-        encoder_mask,
-        decoder_mask,
-        lm_labels,
-    )
-
-
-# ----------
-# Optimizers
-# ----------
-
-
-class BertSumOptimizer(object):
-    """ Specific optimizer for BertSum.
-
-    As described in [1], the authors fine-tune BertSum for abstractive
-    summarization using two Adam Optimizers with different warm-up steps and
-    learning rate. They also use a custom learning rate scheduler.
-
-    [1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
-        arXiv preprint arXiv:1908.08345 (2019).
-    """
-
-    def __init__(self, model, lr, warmup_steps, beta_1=0.99, beta_2=0.999, eps=1e-8):
-        self.encoder = model.encoder
-        self.decoder = model.decoder
-        self.lr = lr
-        self.warmup_steps = warmup_steps
-
-        self.optimizers = {
-            "encoder": Adam(
-                model.encoder.parameters(),
-                lr=lr["encoder"],
-                betas=(beta_1, beta_2),
-                eps=eps,
-            ),
-            "decoder": Adam(
-                model.decoder.parameters(),
-                lr=lr["decoder"],
-                betas=(beta_1, beta_2),
-                eps=eps,
-            ),
-        }
-
-        self._step = 0
-
-    def _update_rate(self, stack):
-        return self.lr[stack] * min(
-            self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-0.5)
-        )
-
-    def zero_grad(self):
-        self.optimizer_decoder.zero_grad()
-        self.optimizer_encoder.zero_grad()
-
-    def step(self):
-        self._step += 1
-        for stack, optimizer in self.optimizers.items():
-            new_rate = self._update_rate(stack)
-            for param_group in optimizer.param_groups:
-                param_group["lr"] = new_rate
-            optimizer.step()
-
-
-# ------------
-# Train
-# ------------
-
-
-def train(args, model, tokenizer):
-    """ Fine-tune the pretrained model on the corpus. """
-    set_seed(args)
-
-    # Load the data
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_dataset = load_and_cache_examples(args, tokenizer)
-    train_sampler = RandomSampler(train_dataset)
-    model_collate_fn = functools.partial(collate, tokenizer=tokenizer, block_size=512)
-    train_dataloader = DataLoader(
-        train_dataset,
-        sampler=train_sampler,
-        batch_size=args.train_batch_size,
-        collate_fn=model_collate_fn,
-    )
-
-    # Training schedule
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = t_total // (
-            len(train_dataloader) // args.gradient_accumulation_steps + 1
-        )
-    else:
-        t_total = (
-            len(train_dataloader)
-            // args.gradient_accumulation_steps
-            * args.num_train_epochs
-        )
-
-    # Prepare the optimizer
-    lr = {"encoder": 0.002, "decoder": 0.2}
-    warmup_steps = {"encoder": 20000, "decoder": 10000}
-    optimizer = BertSumOptimizer(model, lr, warmup_steps)
-
-    # Train
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info(
-        "  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size
-    )
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size * args.gradient_accumulation_steps
-        # * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    model.zero_grad()
-    train_iterator = trange(args.num_train_epochs, desc="Epoch", disable=True)
-
-    global_step = 0
-    tr_loss = 0.0
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True)
-        for step, batch in enumerate(epoch_iterator):
-            source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch
-
-            source = source.to(args.device)
-            target = target.to(args.device)
-            encoder_token_type_ids = encoder_token_type_ids.to(args.device)
-            encoder_mask = encoder_mask.to(args.device)
-            decoder_mask = decoder_mask.to(args.device)
-            lm_labels = lm_labels.to(args.device)
-
-            model.train()
-            outputs = model(
-                source,
-                target,
-                encoder_token_type_ids=encoder_token_type_ids,
-                encoder_attention_mask=encoder_mask,
-                decoder_attention_mask=decoder_mask,
-                decoder_lm_labels=lm_labels,
-            )
-
-            loss = outputs[0]
-            print(loss)
-            if args.gradient_accumulation_steps > 1:
-                loss /= args.gradient_accumulation_steps
-
-            loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-                optimizer.step()
-                model.zero_grad()
-                global_step += 1
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    return global_step, tr_loss / global_step
-
-
-# ------------
-# Train
-# ------------
-
-
-def evaluate(args, model, tokenizer, prefix=""):
-    set_seed(args)
-
-    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
-    eval_sampler = SequentialSampler(eval_dataset)
-    eval_dataloader = DataLoader(
-        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size
-    )
-
-    logger.info("***** Running evaluation {} *****".format(prefix))
-    logger.info("  Num examples = %d", len(eval_dataset))
-    logger.info("  Batch size = %d", args.eval_batch_size)
-    eval_loss = 0.0
-    nb_eval_steps = 0
-    model.eval()
-
-    for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch
-
-        source = source.to(args.device)
-        target = target.to(args.device)
-        encoder_token_type_ids = encoder_token_type_ids.to(args.device)
-        encoder_mask = encoder_mask.to(args.device)
-        decoder_mask = decoder_mask.to(args.device)
-        lm_labels = lm_labels.to(args.device)
-
-        with torch.no_grad():
-            outputs = model(
-                source,
-                target,
-                encoder_token_type_ids=encoder_token_type_ids,
-                encoder_attention_mask=encoder_mask,
-                decoder_attention_mask=decoder_mask,
-                decoder_lm_labels=lm_labels,
-            )
-            lm_loss = outputs[0]
-            eval_loss += lm_loss.mean().item()
-        nb_eval_steps += 1
-
-    eval_loss = eval_loss / nb_eval_steps
-    perplexity = torch.exp(torch.tensor(eval_loss))
-
-    result = {"perplexity": perplexity}
-
-    # Save the evaluation's results
-    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-
-    with open(output_eval_file, "w") as writer:
-        logger.info("***** Eval results {} *****".format(prefix))
-        for key in sorted(result.keys()):
-            logger.info("  %s = %s", key, str(result[key]))
-            writer.write("%s = %s\n" % (key, str(result[key])))
-
-    return result
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The input training data file (a text file).",
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-
-    # Optional parameters
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument(
-        "--do_evaluate",
-        type=bool,
-        default=False,
-        help="Run model evaluation on out-of-sample data.",
-    )
-    parser.add_argument("--do_train", type=bool, default=False, help="Run training.")
-    parser.add_argument(
-        "--do_overwrite_output_dir",
-        type=bool,
-        default=False,
-        help="Whether to overwrite the output dir.",
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default="bert-base-cased",
-        type=str,
-        help="The model checkpoint to initialize the encoder and decoder's weights with.",
-    )
-    parser.add_argument(
-        "--model_type",
-        default="bert",
-        type=str,
-        help="The decoder architecture to be fine-tuned.",
-    )
-    parser.add_argument(
-        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument(
-        "--to_cpu", default=False, type=bool, help="Whether to force training on CPU."
-    )
-    parser.add_argument(
-        "--num_train_epochs",
-        default=10,
-        type=int,
-        help="Total number of training epochs to perform.",
-    )
-    parser.add_argument(
-        "--per_gpu_train_batch_size",
-        default=4,
-        type=int,
-        help="Batch size per GPU/CPU for training.",
-    )
-    parser.add_argument("--seed", default=42, type=int)
-    args = parser.parse_args()
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.do_overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --do_overwrite_output_dir to overwrite.".format(
-                args.output_dir
-            )
-        )
-
-    # Set up training device
-    if args.to_cpu or not torch.cuda.is_available():
-        args.device = torch.device("cpu")
-        args.n_gpu = 0
-    else:
-        args.device = torch.device("cuda")
-        args.n_gpu = torch.cuda.device_count()
-
-    # Load pretrained model and tokenizer. The decoder's weights are randomly initialized.
-    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
-    config = BertConfig.from_pretrained(args.model_name_or_path)
-    decoder_model = BertForMaskedLM(config)
-    model = Model2Model.from_pretrained(
-        args.model_name_or_path, decoder_model=decoder_model
-    )
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        0,
-        args.device,
-        args.n_gpu,
-        False,
-        False,
-    )
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Train the model
-    model.to(args.device)
-    if args.do_train:
-        global_step, tr_loss = train(args, model, tokenizer)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-        if not os.path.exists(args.output_dir):
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = (
-            model.module if hasattr(model, "module") else model
-        )  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-        torch.save(args, os.path.join(args.output_dir, "training_arguments.bin"))
-
-    # Evaluate the model
-    results = {}
-    if args.do_evaluate:
-        checkpoints = []
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        for checkpoint in checkpoints:
-            encoder_checkpoint = os.path.join(checkpoint, "encoder")
-            decoder_checkpoint = os.path.join(checkpoint, "decoder")
-            model = PreTrainedEncoderDecoder.from_pretrained(
-                encoder_checkpoint, decoder_checkpoint
-            )
-            model.to(args.device)
-            results = "placeholder"
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/run_tf_glue.py
+++ b/examples/run_tf_glue.py
@@ -73,6 +73,8 @@ model.save_pretrained('./save/')

 if TASK == "mrpc":
    # Load the TensorFlow model in PyTorch for inspection
+    # This is to demo the interoperability between the two frameworks, you don't have to 
+    # do this in real life (you can run the inference on the TF model).
    pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)

    # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task