Merge remote-tracking branch 'origin/master' into run_multiple_choice_add_doc

982f181a · erenup · 603b470a · 84b9d1c4 · 982f181a · 982f181a
Commit 982f181a authored Sep 16, 2019 by erenup
20 changed files
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -4,8 +4,8 @@ jobs:
        working_directory: ~/pytorch-transformers
        docker:
            - image: circleci/python:3.5
-        resource_class: large
-        parallelism: 4
+        resource_class: xlarge
+        parallelism: 1
        steps:
            - checkout
            - run: sudo pip install --progress-bar off .
@@ -17,7 +17,7 @@ jobs:
    build_py2:
        working_directory: ~/pytorch-transformers
        resource_class: large
-        parallelism: 4
+        parallelism: 1
        docker:
            - image: circleci/python:2.7
        steps:
@@ -26,9 +26,28 @@ jobs:
            - run: sudo pip install pytest codecov pytest-cov
            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
            - run: codecov
+    deploy_doc:
+        working_directory: ~/pytorch-transformers
+        docker:
+            - image: circleci/python:3.5
+        steps:
+            - add_ssh_keys:
+                  fingerprints:
+                      - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
+            - checkout
+            - run: sudo pip install --progress-bar off -r docs/requirements.txt
+            - run: sudo pip install --progress-bar off -r requirements.txt
+            - run: cd docs/source && ln -s ../../examples/README.md examples.md && cd -
+            - run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
+workflow_filters: &workflow_filters
+    filters:
+        branches:
+            only:
+                - master
 workflows:
    version: 2
    build_and_test:
        jobs:
            - build_py3
            - build_py2
+            - deploy_doc: *workflow_filters
\ No newline at end of file
--- a/.gitignore
+++ b/.gitignore
@@ -131,3 +131,4 @@ examples/runs

 # data
 data
+serialization_dir
\ No newline at end of file
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ These implementations have been tested on several datasets (see the example scri
 | Section | Description |
 |-|-|
 | [Installation](#installation) | How to install the package |
+| [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
 | [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
 | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
 | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers |
@@ -68,6 +69,14 @@ It contains an example of a conversion script from a Pytorch trained Transformer
 At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
 or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!

+## Online demo
+
+**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team at transformer.huggingface.co, is the official demo of this repo’s text generation capabilities.
+You can use it to experiment with completions generated by `GPT2Model`, `TransfoXLModel`, and `XLNetModel`.
+
+> “🦄 Write with transformer is to writing what calculators are to calculus.”
+
+![write_with_transformer](https://transformer.huggingface.co/front/assets/thumbnail-large.png)

 ## Quick tour

@@ -95,7 +104,7 @@ for model_class, tokenizer_class, pretrained_weights in MODELS:
    model = model_class.from_pretrained(pretrained_weights)

    # Encode text
-    input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")])
+    input_ids = torch.tensor([tokenizer.encode("Here is some text to encode", add_special_tokens=True)])  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples


--- a/docs/README.md
+++ b/docs/README.md
@@ -34,6 +34,13 @@ pip install recommonmark

 ## Building the documentation

+Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the followig 
+command to generate it:
+
+```bash
+ln -s ../../examples/README.md source/examples.md
+```
+
 Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:

 ```bash

--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -26,3 +26,4 @@ sphinxcontrib-jsmath==1.0.1
 sphinxcontrib-qthelp==1.0.2
 sphinxcontrib-serializinghtml==1.1.3
 urllib3==1.25.3
+sphinx-markdown-tables==0.0.9
\ No newline at end of file
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,7 +26,7 @@ author = u'huggingface'
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'1.0.0'
+release = u'1.2.0'


 # -- General configuration ---------------------------------------------------
@@ -43,7 +43,8 @@ extensions = [
    'sphinx.ext.coverage',
    'sphinx.ext.napoleon',
    'recommonmark',
-    'sphinx.ext.viewcode'
+    'sphinx.ext.viewcode',
+    'sphinx_markdown_tables'
 ]

 # Add any paths that contain templates here, relative to this directory.

--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -11,6 +11,8 @@ The library currently contains PyTorch implementations, pre-trained model weight
 4. `Transformer-XL <https://github.com/kimiyoung/transformer-xl>`_ (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
+7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+8. `DistilBERT <https://huggingface.co/pytorch-transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf.

 .. toctree::
    :maxdepth: 2

--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -52,6 +52,12 @@ If you want to reproduce the original tokenization process of the ``OpenAI GPT``
 If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).


+Note on model downloads (Continuous Integration or large-scale deployments)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help.
+
+
 Do you want to run a Transformer model on a mobile device?
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -2,35 +2,35 @@ DistilBERT
 ----------------------------------------------------

 ``DistilBertConfig``
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: pytorch_transformers.DistilBertConfig
    :members:


 ``DistilBertTokenizer``
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: pytorch_transformers.DistilBertTokenizer
    :members:


 ``DistilBertModel``
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: pytorch_transformers.DistilBertModel
    :members:


 ``DistilBertForMaskedLM``
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: pytorch_transformers.DistilBertForMaskedLM
    :members:


 ``DistilBertForSequenceClassification``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: pytorch_transformers.DistilBertForSequenceClassification
    :members:

--- a/examples/README.md
+++ b/examples/README.md
+# Examples
+
+In this section a few examples are put together. All of these examples work for several models, making use of the very
+similar API between the different models.
+
+| Section                    | Description                                                                                                                                                |
+|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [Language Model fine-tuning](#language-model-fine-tuning) | Fine-tuning the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
+| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.                                         |
+| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision.                              |
+| [SQuAD](#squad) | Using BERT for question answering, examples with distributed training.                                                                                     |
+
+## Language model fine-tuning
+
+Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_lm_finetuning.py).
+
+Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT 
+to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa 
+are fine-tuned using a masked language modeling (MLM) loss.
+
+Before running the following example, you should get a file that contains text on which the language model will be
+fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
+
+We will refer to two different files: `$TRAIN_FILE`, which contains text for training, and `$TEST_FILE`, which contains
+text that will be used for evaluation.
+
+### GPT-2/GPT and causal language modeling
+
+The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before
+the tokenization). The loss here is that of causal language modeling.
+
+```bash
+export TRAIN_FILE=/path/to/dataset/wiki.train.raw
+export TEST_FILE=/path/to/dataset/wiki.test.raw
+
+python run_lm_finetuning.py \
+    --output_dir=output \
+    --model_type=gpt2 \
+    --model_name_or_path=gpt2 \
+    --do_train \
+    --train_data_file=$TRAIN_FILE \
+    --do_eval \
+    --eval_data_file=$TEST_FILE
+```
+
+This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches
+a score of ~20 perplexity once fine-tuned on the dataset.
+
+### RoBERTa/BERT and masked language modeling
+
+The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different
+as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their
+pre-training: masked language modeling. 
+
+In accordance to the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, converge
+slightly slower (over-fitting takes more epochs).
+
+We use the `--mlm` flag so that the script may change its loss function.
+
+```bash
+export TRAIN_FILE=/path/to/dataset/wiki.train.raw
+export TEST_FILE=/path/to/dataset/wiki.test.raw
+
+python run_lm_finetuning.py \
+    --output_dir=output \
+    --model_type=roberta \
+    --model_name_or_path=roberta-base \
+    --do_train \
+    --train_data_file=$TRAIN_FILE \
+    --do_eval \
+    --eval_data_file=$TEST_FILE \
+    --mlm
+```
+
+## Language generation
+
+Based on the script [`run_generation.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_generation.py).
+
+Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.
+A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you
+can try out the different models available in the library.
+
+Example usage:
+
+```bash
+python run_generation.py \
+    --model_type=gpt2 \
+    --model_name_or_path=gpt2
+```
+
+## GLUE
+
+Based on the script [`run_glue.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py).
+
+Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding 
+Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa. 
+
+GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an
+uncased  BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on 8  V100 GPUs with a total train
+batch size of 24. Some of these tasks have a small dataset and training can lead to high variance in the results
+between different runs. We report the median on 5 runs (with different seeds) for each of the metrics.
+
+| Task  | Metric                       | Result      |
+|-------|------------------------------|-------------|
+| CoLA  | Matthew's corr               | 55.75       |
+| SST-2 | Accuracy                     | 92.09       |
+| MRPC  | F1/Accuracy                  | 90.48/86.27 |
+| STS-B | Person/Spearman corr.        | 89.03/88.64 |
+| QQP   | Accuracy/F1                  | 90.92/87.72 |
+| MNLI  | Matched acc./Mismatched acc. | 83.74/84.06 |
+| QNLI  | Accuracy                     | 91.07       |
+| RTE   | Accuracy                     | 68.59       |
+| WNLI  | Accuracy                     | 43.66       |
+
+Some of these results are significantly different from the ones reported on the test set
+of GLUE benchmark on the website. For QQP and WNLI, please refer to [FAQ #12](https://gluebenchmark.com/faq) on the webite.
+
+Before running anyone of these GLUE tasks you should download the
+[GLUE data](https://gluebenchmark.com/tasks) by running
+[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
+and unpack it to some directory `$GLUE_DIR`.
+
+```bash
+export GLUE_DIR=/path/to/glue
+export TASK_NAME=MRPC
+
+python run_glue.py \
+  --model_type bert \
+  --model_name_or_path bert-base-cased \
+  --task_name $TASK_NAME \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir $GLUE_DIR/$TASK_NAME \
+  --max_seq_length 128 \
+  --per_gpu_train_batch_size 32 \
+  --learning_rate 2e-5 \
+  --num_train_epochs 3.0 \
+  --output_dir /tmp/$TASK_NAME/
+```
+
+where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
+
+The dev set results will be present within the text file `eval_results.txt` in the specified output_dir. 
+In case of MNLI, since there are two separate dev sets (matched and mismatched), there will be a separate 
+output folder called `/tmp/MNLI-MM/` in addition to `/tmp/MNLI/`.
+
+The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI, 
+CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being 
+said, there shouldn’t be any issues in running half-precision training with the remaining GLUE tasks as well, 
+since the data processor for each task inherits from the base class DataProcessor.
+
+### MRPC
+
+#### Fine-tuning example
+
+The following examples fine-tune BERT on the Microsoft Research Paraphrase Corpus (MRPC) corpus and runs in less 
+than 10 minutes on a single K-80 and in 27 seconds (!) on single tesla V100 16GB with apex installed.
+
+Before running anyone of these GLUE tasks you should download the
+[GLUE data](https://gluebenchmark.com/tasks) by running
+[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
+and unpack it to some directory `$GLUE_DIR`.
+
+```bash
+export GLUE_DIR=/path/to/glue
+
+python run_glue.py \
+  --model_type bert \
+  --model_name_or_path bert-base-cased \
+  --task_name MRPC \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir $GLUE_DIR/MRPC/ \
+  --max_seq_length 128 \
+  --per_gpu_train_batch_size 32 \
+  --learning_rate 2e-5 \
+  --num_train_epochs 3.0 \
+  --output_dir /tmp/mrpc_output/
+```
+
+Our test ran on a few seeds with [the original implementation hyper-
+parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks) gave evaluation 
+results between 84% and 88%.
+
+#### Using Apex and mixed-precision
+
+Using Apex and 16 bit precision, the fine-tuning on MRPC only takes 27 seconds. First install 
+[apex](https://github.com/NVIDIA/apex), then run the following example:
+
+```bash
+export GLUE_DIR=/path/to/glue
+
+python run_glue.py \
+  --model_type bert \
+  --model_name_or_path bert-base-cased \
+  --task_name MRPC \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir $GLUE_DIR/MRPC/ \
+  --max_seq_length 128 \
+  --per_gpu_train_batch_size 32 \
+  --learning_rate 2e-5 \
+  --num_train_epochs 3.0 \
+  --output_dir /tmp/mrpc_output/ \
+  --fp16
+```
+
+#### Distributed training
+
+Here is an example using distributed training on 8 V100 GPUs. The model used is the BERT whole-word-masking and it
+reaches F1 > 92 on MRPC.
+
+```bash
+export GLUE_DIR=/path/to/glue
+
+python -m torch.distributed.launch \
+    --nproc_per_node 8 run_glue.py \
+    --model_type bert \
+    --model_name_or_path bert-base-cased \
+    --task_name MRPC \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --data_dir $GLUE_DIR/MRPC/ \
+    --max_seq_length 128 \
+    --per_gpu_train_batch_size 8 \
+    --learning_rate 2e-5 \
+    --num_train_epochs 3.0 \
+    --output_dir /tmp/mrpc_output/
+```
+
+Training with these hyper-parameters gave us the following results:
+
+```bash
+acc = 0.8823529411764706
+acc_and_f1 = 0.901702786377709
+eval_loss = 0.3418912578906332
+f1 = 0.9210526315789473
+global_step = 174
+loss = 0.07231863956341798
+```
+
+### MNLI
+
+The following example uses the BERT-large, uncased, whole-word-masking model and fine-tunes it on the MNLI task.
+
+```bash
+export GLUE_DIR=/path/to/glue
+
+python -m torch.distributed.launch \
+    --nproc_per_node 8 run_glue.py \
+    --model_type bert \
+    --model_name_or_path bert-base-cased \
+    --task_name mnli \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --data_dir $GLUE_DIR/MNLI/ \
+    --max_seq_length 128 \
+    --per_gpu_train_batch_size 8 \
+    --learning_rate 2e-5 \
+    --num_train_epochs 3.0 \
+    --output_dir output_dir \
+```
+
+The results  are the following:
+
+```bash
+***** Eval results *****
+  acc = 0.8679706601466992
+  eval_loss = 0.4911287787382479
+  global_step = 18408
+  loss = 0.04755385363816904
+
+***** Eval results *****
+  acc = 0.8747965825874695
+  eval_loss = 0.45516540421714036
+  global_step = 18408
+  loss = 0.04755385363816904
+```
+
+## SQuAD
+
+Based on the script [`run_squad.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py).
+
+#### Fine-tuning on SQuAD
+
+This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) 
+on a single tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a 
+$SQUAD_DIR directory.
+
+* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
+* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
+* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
+
+```bash
+export SQUAD_DIR=/path/to/SQUAD
+
+python run_squad.py \
+  --model_type bert \
+  --model_name_or_path bert-base-cased \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --train_file $SQUAD_DIR/train-v1.1.json \
+  --predict_file $SQUAD_DIR/dev-v1.1.json \
+  --per_gpu_train_batch_size 12 \
+  --learning_rate 3e-5 \
+  --num_train_epochs 2.0 \
+  --max_seq_length 384 \
+  --doc_stride 128 \
+  --output_dir /tmp/debug_squad/
+```
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 88.52
+exact_match = 81.22
+```
+
+#### Distributed training
+
+
+Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:
+
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \
+    --model_type bert \
+    --model_name_or_path bert-base-cased \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --train_file $SQUAD_DIR/train-v1.1.json \
+    --predict_file $SQUAD_DIR/dev-v1.1.json \
+    --learning_rate 3e-5 \
+    --num_train_epochs 2 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir ../models/wwm_uncased_finetuned_squad/ \
+    --per_gpu_train_batch_size 24 \
+    --gradient_accumulation_steps 12
+```
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 93.15
+exact_match = 86.91
+```
+
+This fine-tuneds model is available as a checkpoint under the reference
+`bert-large-uncased-whole-word-masking-finetuned-squad`.
+
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -9,6 +9,12 @@ DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and l
 For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5
 ).

+## Setup
+
+This part of the library has only be tested with Python3.6+. There are few specific dependencies to install before launching a distillation, you can install them with the command `pip install -r requirements.txt`. 
+
+**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend to [compile PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/pytorch-transformers/issues/1179) for more details.
+
 ## How to use DistilBERT

 PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
@@ -68,7 +74,7 @@ python train.py \

 By default, this will launch a training on a single GPU (even if more are available on the cluster). Other parameters are available in the command line, please look in `train.py` or run `python train.py --help` to list them.

-We highly encourage you to distributed training for training DistilBert as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:
+We highly encourage you to use distributed training for training DistilBert as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:

 ```bash
 export NODE_RANK=0
@@ -90,11 +96,11 @@ python -m torch.distributed.launch \
    train.py \
        --force \
        --n_gpu $WORLD_SIZE \
-        --data_file data/dump_concat_wiki_toronto_bk.bert-base-uncased.pickle \
-        --token_counts data/token_counts_concat_wiki_toronto_bk.bert-base-uncased.pickle \
-        --dump_path serialization_dir/with_transform/last_word
+        --data_file data/binarized_text.bert-base-uncased.pickle \
+        --token_counts data/token_counts.bert-base-uncased.pickle \
+        --dump_path serialization_dir/my_first_distillation
 ```

-**Tips** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract_for_distil.py` to create a valid initialization checkpoint and use `--from_pretrained_weights` and `--from_pretrained_config` arguments to use this initialization for the distilled training!
+**Tips:** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract_for_distil.py` to create a valid initialization checkpoint and use `--from_pretrained_weights` and `--from_pretrained_config` arguments to use this initialization for the distilled training!

 Happy distillation!
--- a/examples/distillation/dataset.py
+++ b/examples/distillation/dataset.py
@@ -77,7 +77,7 @@ class Dataset:
                    if sub_s[0] != cls_id:
                        sub_s = np.insert(sub_s, 0, cls_id)
                    if sub_s[-1] != sep_id:
-                        sub_s = np.insert(sub_s, len(sub_s), cls_id)
+                        sub_s = np.insert(sub_s, len(sub_s), sep_id)
                    assert len(sub_s) <= max_len
                    sub_seqs.append(sub_s)


--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -17,6 +17,7 @@
 """
 import os
 import math
+import psutil
 from tensorboardX import SummaryWriter
 from tqdm import trange, tqdm
 import numpy as np
@@ -192,7 +193,7 @@ class Distiller:
        x_prob = self.token_probs[token_ids.flatten()]
        n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item())
        tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False)
-        pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.uint8, device=token_ids.device)
+        pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.bool, device=token_ids.device) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
        pred_mask[tgt_ids] = 1
        pred_mask = pred_mask.view(bs, max_seq_len)

@@ -216,7 +217,7 @@ class Distiller:
        _token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
        token_ids = token_ids.masked_scatter(pred_mask, _token_ids)

-        mlm_labels[1-pred_mask] = -1
+        mlm_labels[~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility

        return token_ids, attn_mask, mlm_labels

@@ -294,7 +295,10 @@ class Distiller:
            if self.is_master: logger.info(f'--- Ending epoch {self.epoch}/{self.params.n_epoch-1}')
            self.end_epoch()

-        if self.is_master: logger.info('Training is finished')
+        if self.is_master:
+            logger.info(f'Save very last checkpoint as `pytorch_model.bin`.')
+            self.save_checkpoint(checkpoint_name=f'pytorch_model.bin')
+            logger.info('Training is finished')

    def step(self,
             input_ids: torch.tensor,
@@ -379,9 +383,9 @@ class Distiller:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
-            self.scheduler.step()
            self.optimizer.step()
            self.optimizer.zero_grad()
+            self.scheduler.step()

    def iter(self):
        """
@@ -419,6 +423,8 @@ class Distiller:
            self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
        self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter)
        
+        self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter)
+
    def end_epoch(self):
        """
        Finally arrived at the end of epoch (full pass on dataset).

--- a/examples/distillation/requirements.txt
+++ b/examples/distillation/requirements.txt
 gitpython==3.0.2
+tensorboard>=1.14.0
+tensorboardX==1.8
+psutil==5.6.3
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -21,8 +21,12 @@ import random
 import time
 import numpy as np
 from pytorch_transformers import BertTokenizer
+import logging

-from ..utils import logger
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)

 def main():
    parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")

--- a/examples/distillation/scripts/token_counts.py
+++ b/examples/distillation/scripts/token_counts.py
@@ -18,8 +18,12 @@ Preprocessing script before training DistilBERT.
 from collections import Counter
 import argparse
 import pickle
+import logging

-from utils import logger
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)

 if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)")

--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -235,8 +235,9 @@ def main():

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)
-    if args.fp16:
-        model.half()
+    # We don't need to manually call model.half() following Apex's recommend
+    # if args.fp16:
+    #     model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
@@ -257,25 +258,36 @@ def main():
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
+                                     t_total=num_train_optimization_steps)
+
    if args.fp16:
        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
+            # from apex.optimizers import FP16_Optimizer
+            # from apex.optimizers import FusedAdam
+            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-    else:
-        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)
+        # This below line of code is the main upgrade of Apex Fp16 implementation. I chose opt_leve="01"
+        # because it's recommended for typical use by Apex. We can make it configured
+        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
+
+    # We don't need to use FP16_Optimizer wrapping over FusedAdam as well. Now Apex supports all Pytorch Optimizer
+
+    #     optimizer = FusedAdam(optimizer_grouped_parameters,
+    #                           lr=args.learning_rate,
+    #                           bias_correction=False,
+    #                           max_grad_norm=1.0)
+    #     if args.loss_scale == 0:
+    #         optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+    #     else:
+    #         optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+    # else:
+    #     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
@@ -304,7 +316,10 @@ def main():
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
-                    optimizer.backward(loss)
+                    # I depricate FP16_Optimizer's backward func and replace as Apex document
+                    # optimizer.backward(loss)
+                    with amp.scale_loss(loss, optimizer) as scaled_loss:
+                        scaled_loss.backward()
                else:
                    loss.backward()
                tr_loss += loss.item()

--- a/examples/lm_finetuning/pregenerate_training_data.py
+++ b/examples/lm_finetuning/pregenerate_training_data.py
@@ -329,6 +329,7 @@ def main():
                    doc = []
                else:
                    tokens = tokenizer.tokenize(line)
+                    if tokens:
                        doc.append(tokens)
            if doc:
                docs.add_document(doc)  # If the last doc didn't end on a newline, make sure it still gets added

--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -474,6 +474,7 @@ def main():
    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))