#!/usr/bin/env python3 -u
import argparse
import fileinput
import logging
import os
import sys
from fairseq.models.transformer import TransformerModel
logging.getLogger().setLevel(logging.INFO)
def main():
parser = argparse.ArgumentParser(description="")
parser.add_argument("--en2fr", required=True, help="path to en2fr model")
parser.add_argument(
"--fr2en", required=True, help="path to fr2en mixture of experts model"
)
parser.add_argument(
"--user-dir", help="path to fairseq examples/translation_moe/src directory"
)
parser.add_argument(
"--num-experts",
type=int,
default=10,
help="(keep at 10 unless using a different model)",
)
parser.add_argument(
"files",
nargs="*",
default=["-"],
help='input files to paraphrase; "-" for stdin',
)
args = parser.parse_args()
if args.user_dir is None:
args.user_dir = os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))), # examples/
"translation_moe",
"src",
)
if os.path.exists(args.user_dir):
logging.info("found user_dir:" + args.user_dir)
else:
raise RuntimeError(
"cannot find fairseq examples/translation_moe/src "
"(tried looking here: {})".format(args.user_dir)
)
logging.info("loading en2fr model from:" + args.en2fr)
en2fr = TransformerModel.from_pretrained(
model_name_or_path=args.en2fr,
tokenizer="moses",
bpe="sentencepiece",
).eval()
logging.info("loading fr2en model from:" + args.fr2en)
fr2en = TransformerModel.from_pretrained(
model_name_or_path=args.fr2en,
tokenizer="moses",
bpe="sentencepiece",
user_dir=args.user_dir,
task="translation_moe",
).eval()
def gen_paraphrases(en):
fr = en2fr.translate(en)
return [
fr2en.translate(fr, inference_step_args={"expert": i})
for i in range(args.num_experts)
]
logging.info("Type the input sentence and press return:")
for line in fileinput.input(args.files):
line = line.strip()
if len(line) == 0:
continue
for paraphrase in gen_paraphrases(line):
print(paraphrase)
if __name__ == "__main__":
main()
# Pay Less Attention with Lightweight and Dynamic Convolutions (Wu et al., 2019)
This page contains pointers to pre-trained models as well as instructions on how to train new models for [our paper](https://arxiv.org/abs/1901.10430).
## Citation:
```bibtex
@inproceedings{wu2018pay,
title = {Pay Less Attention with Lightweight and Dynamic Convolutions},
author = {Felix Wu and Angela Fan and Alexei Baevski and Yann Dauphin and Michael Auli},
booktitle = {International Conference on Learning Representations},
year = {2019},
url = {https://arxiv.org/abs/1901.10430},
}
```
## Translation
### Pre-trained models
For some datasets we release models without GLUs, which are faster at inference.
Model | Description | Dataset | Download
---|---|---|---
`lightconv.no_glu.iwslt14.de-en` | LightConv (without GLUs) | [IWSLT14 German-English](https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.lightconv.tar.gz) <br> IWSLT14 test: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/iwslt14.de-en.test.tar.bz2)
`dynamicconv.no_glu.iwslt14.de-en` | DynamicConv (without GLUs) | [IWSLT14 German-English](https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.dynamicconv.tar.gz) <br> IWSLT14 test: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/iwslt14.de-en.test.tar.bz2)
`lightconv.no_glu.wmt16.en-de` | LightConv (without GLUs) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2)
`dynamicconv.no_glu.wmt16.en-de` | DynamicConv (without GLUs) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2)
`lightconv.glu.wmt16.en-de` | LightConv | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv-glu.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2)
`dynamicconv.glu.wmt16.en-de` | DynamicConv | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv-glu.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2)
`lightconv.glu.wmt14.en-fr` | LightConv | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.lightconv-glu.tar.gz) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2)
`dynamicconv.glu.wmt14.en-fr` | DynamicConv | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.dynamicconv-glu.tar.gz) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2)
`lightconv.glu.wmt17.zh-en` | LightConv | [WMT17 Chinese-English](http://statmt.org/wmt17/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.lightconv-glu.tar.gz) <br> newstest2017: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt17.zh-en.newstest2017.tar.bz2)
`dynamicconv.glu.wmt17.zh-en` | DynamicConv | [WMT17 Chinese-English](http://statmt.org/wmt17/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.dynamicconv-glu.tar.gz) <br> newstest2017: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt17.zh-en.newstest2017.tar.bz2)
### Memory-Efficient CUDA Kernels
Since the PyTorch implementations of Light/Dynamic conv are quite memory intensive, we have developed CUDA kernels that implement the light and dynamic convolution operator in a memory-efficient and performant manner. For large sequence lengths, these kernels save about 50% memory compared to the PyTorch equivalent.
To install the kernels, use the commands below. Once installed, they will automatically be used in place of the PyTorch implementations whenever a light or dynamic convolution is used.
```sh
# to install lightconv
cd fairseq/modules/lightconv_layer
python cuda_function_gen.py
python setup.py install
# to install dynamicconv
cd fairseq/modules/dynamicconv_layer
python cuda_function_gen.py
python setup.py install
```
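If you want to check whether the compiled kernels are actually picked up, a quick import test works. This is a minimal sketch; it assumes the extensions are installed as Python modules named `lightconv_cuda` and `dynamicconv_cuda` (the names used by the kernel `setup.py` scripts), which should be verified against your fairseq version:
```python
# Minimal check that the compiled extensions are importable. If an import
# fails, fairseq falls back to the PyTorch implementation.
import importlib

for name in ("lightconv_cuda", "dynamicconv_cuda"):
    try:
        importlib.import_module(name)
        print(f"{name}: CUDA kernel available")
    except ImportError:
        print(f"{name}: not found, PyTorch implementation will be used")
```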
### Example usage (torch.hub)
We require a few additional Python dependencies for preprocessing:
```bash
pip install sacremoses subword_nmt
```
Interactive translation via PyTorch Hub:
```python
import fairseq
import torch
# List available models
torch.hub.list('pytorch/fairseq') # [..., 'lightconv.glu.wmt17.zh-en', ... ]
# Load the LightConv model trained on WMT'17 Zh-En
zh2en = torch.hub.load('pytorch/fairseq', 'lightconv.glu.wmt17.zh-en', tokenizer='moses', bpe='subword_nmt')
# The underlying model is available under the *models* attribute
assert isinstance(zh2en.models[0], fairseq.models.lightconv.LightConvModel)
# Translate a sentence
zh2en.translate('你好 世界')
# 'Hello World'
```
Loading custom models:
```python
from fairseq.models.lightconv import LightConvModel
en2fr = LightConvModel.from_pretrained(
'/path/to/checkpoints',
checkpoint_file='checkpoint_best.pt',
data_name_or_path='data-bin/wmt14_en_fr',
bpe='subword_nmt',
bpe_codes='data-bin/wmt14_en_fr/en.code'
)
en2fr.translate('Hello world!')
# 'Bonjour le monde'
```
### Preprocessing the training datasets
Please follow the instructions in [`examples/translation/README.md`](../translation/README.md) to preprocess the data.
### Training and evaluation options
To use the model without GLU, please set `--encoder-glu 0 --decoder-glu 0`.
For LightConv, please use `--encoder-conv-type lightweight --decoder-conv-type lightweight`, otherwise the default is DynamicConv.
For best BLEU results, the length penalty (`--lenpen`) may need to be manually tuned.
To use the CUDA kernels, first install the PyTorch modules using the commands
above. Once the CUDA modules are installed, they will automatically be used
instead of the PyTorch modules.
### IWSLT14 De-En
Training and evaluating DynamicConv (without GLU) on a GPU:
```sh
# Training
SAVE="save/dynamic_conv_iwslt"
mkdir -p $SAVE
CUDA_VISIBLE_DEVICES=0 $(which fairseq-train) data-bin/iwslt14.tokenized.de-en \
--clip-norm 0 --optimizer adam --lr 0.0005 \
--source-lang de --target-lang en --max-tokens 4000 --no-progress-bar \
--log-interval 100 --stop-min-lr '1e-09' --weight-decay 0.0001 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--lr-scheduler inverse_sqrt \
--ddp-backend=legacy_ddp \
--max-update 50000 --warmup-updates 4000 --warmup-init-lr '1e-07' \
--adam-betas '(0.9, 0.98)' --keep-last-epochs 10 \
-a lightconv_iwslt_de_en --save-dir $SAVE \
--dropout 0.3 --attention-dropout 0.1 --weight-dropout 0.1 \
--encoder-glu 0 --decoder-glu 0
python scripts/average_checkpoints.py --inputs $SAVE \
--num-epoch-checkpoints 10 --output "${SAVE}/checkpoint_last10_avg.pt"
# Evaluation
CUDA_VISIBLE_DEVICES=0 fairseq-generate data-bin/iwslt14.tokenized.de-en --path "${SAVE}/checkpoint_last10_avg.pt" --batch-size 128 --beam 4 --remove-bpe --lenpen 1 --gen-subset test --quiet
```
### WMT16 En-De
Training and evaluating DynamicConv (with GLU) on WMT16 En-De using cosine scheduler on one machine with 8 V100 GPUs:
```sh
# Training
SAVE="save/dynamic_conv_wmt16en2de"
mkdir -p $SAVE
python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \
data-bin/wmt16_en_de_bpe32k --fp16 --log-interval 100 --no-progress-bar \
--max-update 30000 --share-all-embeddings --optimizer adam \
--adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
--ddp-backend=legacy_ddp --max-tokens 3584 \
--lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \
--lr-shrink 1 --lr 0.001 --min-lr 1e-7 \
--t-mult 1 --lr-period-updates 20000 \
--arch lightconv_wmt_en_de_big --save-dir $SAVE \
--dropout 0.3 --attention-dropout 0.1 --weight-dropout 0.1 \
--encoder-glu 1 --decoder-glu 1
# Evaluation
CUDA_VISIBLE_DEVICES=0 fairseq-generate data-bin/wmt16.en-de.joined-dict.newstest2014 --path "${SAVE}/checkpoint_best.pt" --batch-size 128 --beam 5 --remove-bpe --lenpen 0.5 --gen-subset test > wmt16_gen.txt
bash scripts/compound_split_bleu.sh wmt16_gen.txt
```
### WMT14 En-Fr
Training DynamicConv (with GLU) on WMT14 En-Fr using cosine scheduler on one machine with 8 V100 GPUs:
```sh
# Training
SAVE="save/dynamic_conv_wmt14en2fr"
mkdir -p $SAVE
python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \
data-bin/wmt14_en_fr --fp16 --log-interval 100 --no-progress-bar \
--max-update 30000 --share-all-embeddings --optimizer adam \
--adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
--ddp-backend=legacy_ddp --max-tokens 3584 \
--lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \
--lr-shrink 1 --lr 0.001 --min-lr 1e-7 \
--t-mult 1 --lr-period-updates 70000 \
--arch lightconv_wmt_en_fr_big --save-dir $SAVE \
--dropout 0.1 --attention-dropout 0.1 --weight-dropout 0.1 \
--encoder-glu 1 --decoder-glu 1
# Evaluation
CUDA_VISIBLE_DEVICES=0 fairseq-generate data-bin/wmt14.en-fr.joined-dict.newstest2014 --path "${SAVE}/checkpoint_best.pt" --batch-size 128 --beam 5 --remove-bpe --lenpen 0.9 --gen-subset test
```
# Transformer with Pointer-Generator Network
This page describes the `transformer_pointer_generator` model that incorporates
a pointing mechanism in the Transformer model that facilitates copying of input
words to the output. This architecture is described in [Enarvi et al. (2020)](https://www.aclweb.org/anthology/2020.nlpmc-1.4/).
## Background
The pointer-generator network was introduced in [See et al. (2017)](https://arxiv.org/abs/1704.04368)
for RNN encoder-decoder attention models. A similar mechanism can be
incorporated in a Transformer model by reusing one of the many attention
distributions for pointing. The attention distribution over the input words is
interpolated with the normal output distribution over the vocabulary words. This
allows the model to generate words that appear in the input, even if they don't
appear in the vocabulary, helping especially with small vocabularies.
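For intuition, the interpolation can be written as a handful of tensor operations. The sketch below is illustrative only and is not taken verbatim from the fairseq implementation (names and shapes are chosen for the example): a generation probability `p_gen` scales the vocabulary distribution, while `1 - p_gen` scales the attention weights, which are scattered onto the vocabulary entries of the corresponding source tokens.
```python
import torch

def pointer_generator_mixture(vocab_probs, attn, src_tokens, p_gen):
    """Illustrative mixture of a vocabulary distribution and a copy distribution.

    vocab_probs: (batch, tgt_len, vocab_size) softmax over the vocabulary
    attn:        (batch, tgt_len, src_len)    attention over source positions
    src_tokens:  (batch, src_len)             source token ids
    p_gen:       (batch, tgt_len, 1)          generation probability in [0, 1]
    """
    gen_dist = vocab_probs * p_gen
    copy_dist = torch.zeros_like(gen_dist)
    index = src_tokens.unsqueeze(1).expand(-1, attn.size(1), -1)
    # Add each attention weight to the vocabulary entry of its source token.
    copy_dist.scatter_add_(2, index, attn * (1.0 - p_gen))
    return gen_dist + copy_dist
```
The actual `output_layer` in `transformer_pg.py` (included later on this page) additionally pads the vocabulary distribution with zeros for the `<unk-N>` entries before mixing.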
## Implementation
The mechanism for copying out-of-vocabulary words from the input has been
implemented differently from See et al. In their [implementation](https://github.com/abisee/pointer-generator)
they convey the word identities through the model in order to be able to produce
words that appear in the input sequence but not in the vocabulary. A different
approach was taken in the Fairseq implementation to keep it self-contained in
the model file, avoiding any changes to the rest of the code base. Instead,
copying out-of-vocabulary words is made possible by pre-processing the input and
post-processing the output. This is described in detail in the next section.
## Usage
The training and evaluation procedure is outlined below. You can also find a
more detailed example for the XSum dataset on [this page](README.xsum.md).
##### 1. Create a vocabulary and extend it with source position markers
The pointing mechanism is especially helpful with small vocabularies, if we are
able to recover the identities of any out-of-vocabulary words that are copied
from the input. For this purpose, the model allows extending the vocabulary with
special tokens that can be used in place of `<unk>` tokens to identify different
input positions. For example, the user may add `<unk-0>`, `<unk-1>`, `<unk-2>`,
etc. to the end of the vocabulary, after the normal words. Below is an example
of how to create a vocabulary of the 10000 most common words and add 1000 input
position markers.
```bash
vocab_size=10000
position_markers=1000
export LC_ALL=C
cat train.src train.tgt |
tr -s '[:space:]' '\n' |
sort |
uniq -c |
sort -k1,1bnr -k2 |
head -n "$((vocab_size - 4))" |
awk '{ print $2 " " $1 }' >dict.pg.txt
python3 -c "[print('<unk-{}> 0'.format(n)) for n in range($position_markers)]" >>dict.pg.txt
```
##### 2. Preprocess the text data
The idea is that any `<unk>` token in the text is replaced with `<unk-0>` if it
appears in the first input position, `<unk-1>` if it appears in the second
input position, and so on. This can be achieved using the `preprocess.py` script
that is provided in this directory.
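For illustration, the replacement amounts to the following. This is a simplified sketch, not the script itself; the real `preprocess.py` also reuses the position of the first occurrence for repeated out-of-vocabulary words and rewrites the target side consistently:
```python
def mark_oovs(source_line, vocabulary):
    """Replace out-of-vocabulary source tokens with <unk-N> position markers."""
    tokens = source_line.split()
    return " ".join(
        tok if tok in vocabulary else "<unk-{}>".format(pos)
        for pos, tok in enumerate(tokens)
    )

print(mark_oovs("de roon moved to teesside", {"de", "moved", "to"}))
# de <unk-1> moved to <unk-4>
```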
##### 3. Train a model
The number of these special tokens is given to the model with the
`--source-position-markers` argument—the model simply maps all of these to the
same word embedding as `<unk>`.
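A rough sketch of that mapping (mirroring the `Embedding.forward` logic in the accompanying `transformer_pg.py`; the concrete numbers follow the XSum example below, where the normal vocabulary has 10000 entries and `<unk>` has index 3):
```python
import torch

num_embeddings = 10000  # size of the normal vocabulary
unk_idx = 3             # dictionary index of <unk>

token_ids = torch.tensor([5, 9999, 10000, 10999])  # the last two are <unk-N> markers
mapped = torch.where(
    token_ids >= num_embeddings, torch.full_like(token_ids, unk_idx), token_ids
)
print(mapped)  # tensor([   5, 9999,    3,    3])
```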
The attention distribution that is used for pointing is selected using the
`--alignment-heads` and `--alignment-layer` command-line arguments in the same
way as with the `transformer_align` model.
##### 4. Generate text and postprocess it
When using the model to generate text, you want to preprocess the input text in
the same way that the training data was processed, replacing out-of-vocabulary words
with `<unk-N>` tokens. If any of these tokens are copied to the output, the
actual words can be retrieved from the unprocessed input text. Any `<unk-N>`
token should be replaced with the word at position N in the original input
sequence. This can be achieved using the `postprocess.py` script.
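The restoration itself is a simple string substitution; a minimal sketch (the provided `postprocess.py` does the same thing with proper error handling):
```python
import re

def restore_oovs(source_line, hypo_line):
    """Replace each <unk-N> in the hypothesis with the source word at position N."""
    source_words = source_line.split()
    return re.sub(
        r"<unk-([0-9]+)>", lambda m: source_words[int(m.group(1))], hypo_line
    )

print(restore_oovs("de roon moved to teesside", "striker de <unk-1> joined <unk-4>"))
# striker de roon joined teesside
```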
## Training a pointer-generator model on the Extreme Summarization dataset
##### 1. Download the Extreme Summarization data and preprocess it
Follow the instructions [here](https://github.com/EdinburghNLP/XSum) to obtain
the original Extreme Summarization dataset. You should have six files,
{train,validation,test}.{document,summary}.
##### 2. Create a vocabulary and extend it with source position markers
```bash
vocab_size=10000
position_markers=1000
export LC_ALL=C
cat train.document train.summary |
tr -s '[:space:]' '\n' |
sort |
uniq -c |
sort -k1,1bnr -k2 |
head -n "$((vocab_size - 4))" |
awk '{ print $2 " " $1 }' >dict.pg.txt
python3 -c "[print('<unk-{}> 0'.format(n)) for n in range($position_markers)]" >>dict.pg.txt
```
This creates the file `dict.pg.txt`, which contains the 10k most frequent words,
followed by 1k source position markers:
```
the 4954867
. 4157552
, 3439668
to 2212159
a 1916857
of 1916820
and 1823350
...
<unk-0> 0
<unk-1> 0
<unk-2> 0
<unk-3> 0
<unk-4> 0
...
```
##### 3. Preprocess the text data
```bash
./preprocess.py --source train.document --target train.summary --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out train.pg.src --target-out train.pg.tgt
./preprocess.py --source validation.document --target validation.summary --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out valid.pg.src --target-out valid.pg.tgt
./preprocess.py --source test.document --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out test.pg.src
```
The data should now contain `<unk-N>` tokens in place of out-of-vocabulary words.
##### 4. Binarize the dataset
```bash
fairseq-preprocess \
--source-lang src \
--target-lang tgt \
--trainpref train.pg \
--validpref valid.pg \
--destdir bin \
--workers 60 \
--srcdict dict.pg.txt \
--joined-dictionary
```
##### 5. Train a model
```bash
total_updates=20000
warmup_updates=500
lr=0.001
max_tokens=4096
update_freq=4
pointer_layer=-2
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 fairseq-train bin \
--user-dir examples/pointer_generator/pointer_generator_src \
--max-tokens "$max_tokens" \
--task translation \
--source-lang src --target-lang tgt \
--truncate-source \
--layernorm-embedding \
--share-all-embeddings \
--encoder-normalize-before \
--decoder-normalize-before \
--required-batch-size-multiple 1 \
--arch transformer_pointer_generator \
--alignment-layer "$pointer_layer" \
--alignment-heads 1 \
--source-position-markers 1000 \
--criterion label_smoothed_cross_entropy \
--label-smoothing 0.1 \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-08 \
--clip-norm 0.1 \
--lr-scheduler inverse_sqrt --lr "$lr" --max-update "$total_updates" --warmup-updates "$warmup_updates" \
--update-freq "$update_freq" \
--skip-invalid-size-inputs-valid-test
```
Above we specify that our dictionary contains 1000 source position markers, and
that we want to use one attention head from the penultimate decoder layer for
pointing. Training should take about 5.5 hours on one node with eight 32GB V100
GPUs. The logged messages confirm that dictionary indices of 10000 and above will
be mapped to
the `<unk>` embedding:
```
2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | [src] dictionary: 11000 types
2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | [tgt] dictionary: 11000 types
2020-09-24 20:43:53 | INFO | fairseq.data.data_utils | loaded 11332 examples from: bin/valid.src-tgt.src
2020-09-24 20:43:53 | INFO | fairseq.data.data_utils | loaded 11332 examples from: bin/valid.src-tgt.tgt
2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | bin valid src-tgt 11332 examples
2020-09-24 20:43:53 | INFO | fairseq.models.transformer_pg | dictionary indices from 10000 to 10999 will be mapped to 3
```
##### 6. Summarize the test sequences
```bash
batch_size=32
beam_size=6
max_length=60
length_penalty=1.0
fairseq-interactive bin \
--user-dir examples/pointer_generator/pointer_generator_src \
--batch-size "$batch_size" \
--task translation \
--source-lang src --target-lang tgt \
--path checkpoints/checkpoint_last.pt \
--input test.pg.src \
--buffer-size 200 \
--max-len-a 0 \
--max-len-b "$max_length" \
--lenpen "$length_penalty" \
--beam "$beam_size" \
--skip-invalid-size-inputs-valid-test |
tee generate.out
grep ^H generate.out | cut -f 3- >generate.hyp
```
Now you should have the generated sequences in `generate.hyp`. They contain
`<unk-N>` tokens that the model has copied from the source sequence. In order to
retrieve the original words, we need the unprocessed source sequences from
`test.document`.
##### 7. Process the generated output
Since we skipped overly long inputs when producing `generate.hyp`, we also have
to skip them now when reading `test.document`.
```bash
./postprocess.py \
--source <(awk 'NF<1024' test.document) \
--target generate.hyp \
--target-out generate.hyp.processed
```
You will now find the final sequences in `generate.hyp.processed`, with each
`<unk-N>` token replaced by the original word from the source sequence.
##### An example of a summarized sequence
The original source document in `test.document`:
> de roon moved to teesside in june 2016 for an initial # 8.8 m fee and played 33 premier league games last term . the netherlands international , 26 , scored five goals in 36 league and cup games during his spell at boro . meanwhile , manager garry monk confirmed the championship club 's interest in signing chelsea midfielder lewis baker . `` he 's a target and one of many that we 've had throughout the summer months , '' said monk . find all the latest football transfers on our dedicated page .
The preprocessed source document in `test.pg.src`:
> de \<unk-1> moved to \<unk-4> in june 2016 for an initial # \<unk-12> m fee and played 33 premier league games last term . the netherlands international , 26 , scored five goals in 36 league and cup games during his spell at boro . meanwhile , manager garry monk confirmed the championship club 's interest in signing chelsea midfielder lewis baker . `` he 's a target and one of many that we 've had throughout the summer months , '' said monk . find all the latest football transfers on our dedicated page .
The generated summary in `generate.hyp`:
> middlesbrough striker \<unk> de \<unk-1> has joined spanish side \<unk> on a season-long loan .
The generated summary after postprocessing in `generate.hyp.processed`:
> middlesbrough striker \<unk> de roon has joined spanish side \<unk> on a season-long loan .
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from . import transformer_pg # noqa
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
from typing import Any, Dict, Optional, List, Tuple
import torch
import torch.nn as nn
from fairseq import utils
from fairseq.models import register_model, register_model_architecture
from fairseq.models.transformer import (
DEFAULT_MAX_SOURCE_POSITIONS,
DEFAULT_MAX_TARGET_POSITIONS,
TransformerDecoder,
TransformerEncoder,
TransformerModel,
base_architecture,
)
from torch import Tensor
logger = logging.getLogger(__name__)
@register_model("transformer_pointer_generator")
class TransformerPointerGeneratorModel(TransformerModel):
"""
Transformer model from `"Attention Is All You Need" (Vaswani et al, 2017)
<https://arxiv.org/abs/1706.03762>`_, augmented with a pointer-generator
network from `"Get To The Point: Summarization with Pointer-Generator
Networks" (See et al, 2017) <https://arxiv.org/abs/1704.04368>`_.
Args:
encoder (TransformerPointerGeneratorEncoder): the encoder
decoder (TransformerPointerGeneratorDecoder): the decoder
The Transformer pointer-generator model provides the following named
architectures and command-line arguments:
.. argparse::
:ref: fairseq.models.transformer_pointer_generator_parser
:prog:
"""
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
# fmt: off
TransformerModel.add_args(parser)
parser.add_argument('--alignment-heads', type=int, metavar='N',
help='number of attention heads to be used for '
'pointing')
parser.add_argument('--alignment-layer', type=int, metavar='I',
help='layer number to be used for pointing (0 '
'corresponding to the bottommost layer)')
parser.add_argument('--source-position-markers', type=int, metavar='N',
help='dictionary includes N additional items that '
'represent an OOV token at a particular input '
'position')
parser.add_argument('--force-generation', type=float, metavar='P',
default=None,
help='set the vocabulary distribution weight to P, '
'instead of predicting it from the input (1.0 '
'corresponding to generation, 0.0 to pointing)')
# fmt: on
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_architecture(args)
if args.encoder_layers_to_keep:
args.encoder_layers = len(args.encoder_layers_to_keep.split(","))
if args.decoder_layers_to_keep:
args.decoder_layers = len(args.decoder_layers_to_keep.split(","))
if getattr(args, "max_source_positions", None) is None:
args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
if getattr(args, "max_target_positions", None) is None:
args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS
if getattr(args, "source_position_markers", None) is None:
args.source_position_markers = args.max_source_positions
src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
if src_dict != tgt_dict:
raise ValueError("Pointer-generator requires a joined dictionary")
def build_embedding(dictionary, embed_dim, path=None):
# The dictionary may include additional items that can be used in
# place of the normal OOV token and that all map to the same
# embedding. Using a different token for each input position allows
# one to restore the word identities from the original source text.
num_embeddings = len(dictionary) - args.source_position_markers
padding_idx = dictionary.pad()
unk_idx = dictionary.unk()
logger.info(
"dictionary indices from {0} to {1} will be mapped to {2}".format(
num_embeddings, len(dictionary) - 1, unk_idx
)
)
emb = Embedding(num_embeddings, embed_dim, padding_idx, unk_idx)
# if provided, load from preloaded dictionaries
if path:
embed_dict = utils.parse_embedding(path)
utils.load_embedding(embed_dict, dictionary, emb)
return emb
if args.share_all_embeddings:
if args.encoder_embed_dim != args.decoder_embed_dim:
raise ValueError(
"--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
)
if args.decoder_embed_path and (
args.decoder_embed_path != args.encoder_embed_path
):
raise ValueError(
"--share-all-embeddings not compatible with --decoder-embed-path"
)
encoder_embed_tokens = build_embedding(
src_dict, args.encoder_embed_dim, args.encoder_embed_path
)
decoder_embed_tokens = encoder_embed_tokens
args.share_decoder_input_output_embed = True
else:
encoder_embed_tokens = build_embedding(
src_dict, args.encoder_embed_dim, args.encoder_embed_path
)
decoder_embed_tokens = build_embedding(
tgt_dict, args.decoder_embed_dim, args.decoder_embed_path
)
encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens)
decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
return cls(args, encoder, decoder)
@classmethod
def build_encoder(cls, args, src_dict, embed_tokens):
return TransformerPointerGeneratorEncoder(args, src_dict, embed_tokens)
@classmethod
def build_decoder(cls, args, tgt_dict, embed_tokens):
return TransformerPointerGeneratorDecoder(args, tgt_dict, embed_tokens)
class TransformerPointerGeneratorEncoder(TransformerEncoder):
"""
Transformer encoder consisting of *args.encoder_layers* layers. Each layer
is a :class:`TransformerEncoderLayer`. The pointer-generator variant adds
the source tokens to the encoder output as these are otherwise not passed
to the decoder.
"""
def forward(
self,
src_tokens,
src_lengths: Optional[Tensor] = None,
return_all_hiddens: bool = False,
token_embeddings: Optional[Tensor] = None
):
"""
Runs the `forward()` method of the parent Transformer class. Then adds
the source tokens into the encoder output tuple.
While it might be more elegant that the model would pass the source
tokens to the `forward()` method of the decoder too, this would require
changes to `SequenceGenerator`.
Args:
src_tokens (torch.LongTensor): tokens in the source language of
shape `(batch, src_len)`
src_lengths (torch.LongTensor): lengths of each source sentence of
shape `(batch)`
return_all_hiddens (bool, optional): also return all of the
intermediate hidden states (default: False).
token_embeddings (torch.Tensor, optional): precomputed embeddings
default `None` will recompute embeddings
Returns:
namedtuple:
- **encoder_out** (Tensor): the last encoder layer's output of
shape `(src_len, batch, embed_dim)`
- **encoder_padding_mask** (ByteTensor): the positions of
padding elements of shape `(batch, src_len)`
- **encoder_embedding** (Tensor): the (scaled) embedding lookup
of shape `(batch, src_len, embed_dim)`
- **encoder_states** (List[Tensor]): all intermediate
hidden states of shape `(src_len, batch, embed_dim)`.
Only populated if *return_all_hiddens* is True.
- **src_tokens** (Tensor): input token ids of shape
`(batch, src_len)`
"""
encoder_out = self.forward_scriptable(src_tokens,
src_lengths,
return_all_hiddens,
token_embeddings)
        # The PyTorch Mobile lite interpreter does not support returning NamedTuple in
# `forward` so we use a dictionary instead.
# TorchScript does not support mixed values so the values are all lists.
# The empty list is equivalent to None.
return {
"encoder_out": encoder_out["encoder_out"], # T x B x C
"encoder_padding_mask": encoder_out["encoder_padding_mask"], # B x T
"encoder_embedding": encoder_out["encoder_embedding"], # B x T x C
"encoder_states": encoder_out["encoder_states"], # List[T x B x C]
"src_tokens": [src_tokens], # B x T
"src_lengths": [],
}
class TransformerPointerGeneratorDecoder(TransformerDecoder):
"""
Transformer decoder consisting of *args.decoder_layers* layers. Each layer
is a :class:`TransformerDecoderLayer`. The pointer-generator variant mixes
the output probabilities with an attention distribution in the output layer.
Args:
args (argparse.Namespace): parsed command-line arguments
dictionary (~fairseq.data.Dictionary): decoding dictionary
embed_tokens (torch.nn.Embedding): output embedding
"""
def __init__(self, args, dictionary, embed_tokens):
super().__init__(args, dictionary, embed_tokens, no_encoder_attn=False)
# In the pointer-generator model these arguments define the decoder
# layer and the number of attention heads that will be averaged to
# create the alignment for pointing.
self.alignment_heads = args.alignment_heads
self.alignment_layer = args.alignment_layer
input_embed_dim = embed_tokens.embedding_dim
# Generation probabilities / interpolation coefficients are predicted
# from the current decoder input embedding and the decoder output, which
# is the size of output_embed_dim.
p_gen_input_size = input_embed_dim + self.output_embed_dim
self.project_p_gens = nn.Linear(p_gen_input_size, 1)
nn.init.zeros_(self.project_p_gens.bias)
# The dictionary may include a separate entry for an OOV token in each
# input position, so that their identity can be restored from the
# original source text.
self.num_types = len(dictionary)
self.num_oov_types = args.source_position_markers
self.num_embeddings = self.num_types - self.num_oov_types
self.force_p_gen = args.force_generation
def forward(
self,
prev_output_tokens,
encoder_out: Optional[Dict[str, List[Tensor]]] = None,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
features_only: bool = False,
alignment_layer: Optional[int] = 0,
alignment_heads: Optional[int] = 1,
src_lengths: Optional[Any] = None,
return_all_hiddens: bool = False,
):
"""
Args:
prev_output_tokens (LongTensor): previous decoder outputs of shape
`(batch, tgt_len)`, for teacher forcing
encoder_out (optional): output from the encoder, used for
encoder-side attention
incremental_state (dict, optional): dictionary used for storing
state during :ref:`Incremental decoding`
features_only (bool, optional): only return features without
applying output layer (default: False)
alignment_layer (int, optional): 0-based index of the layer to be
used for pointing (default: 0)
alignment_heads (int, optional): number of attention heads to be
used for pointing (default: 1)
Returns:
tuple:
- the decoder's output of shape `(batch, tgt_len, vocab)`
- a dictionary with any model-specific outputs
"""
# The normal Transformer model doesn't pass the alignment_layer and
# alignment_heads parameters correctly. We use our local variables.
x, extra = self.extract_features(
prev_output_tokens,
encoder_out=encoder_out,
incremental_state=incremental_state,
alignment_layer=self.alignment_layer,
alignment_heads=self.alignment_heads,
)
if not features_only:
# Embedding the tokens again for generation probability prediction,
# so that we don't have to reimplement the whole extract_features()
# method.
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
prev_output_embed = self.embed_tokens(prev_output_tokens)
prev_output_embed *= self.embed_scale
predictors = torch.cat((prev_output_embed, x), 2)
p_gens = self.project_p_gens(predictors)
p_gens = torch.sigmoid(p_gens.float())
# Torchscript complains if encoder_out or attn are None because
# `output_layer()` signature expects tensors instead
attn: Optional[Tensor] = extra["attn"][0]
assert encoder_out is not None
assert attn is not None
x = self.output_layer(x, attn, encoder_out["src_tokens"][0], p_gens)
return x, extra
def output_layer(
self,
features: Tensor,
attn: Tensor,
src_tokens: Tensor,
p_gens: Tensor
) -> Tensor:
"""
Project features to the vocabulary size and mix with the attention
distributions.
"""
if self.force_p_gen is not None:
p_gens = self.force_p_gen
# project back to size of vocabulary
if self.adaptive_softmax is None:
logits = self.output_projection(features)
else:
logits = features
batch_size = logits.shape[0]
output_length = logits.shape[1]
assert logits.shape[2] == self.num_embeddings
assert src_tokens.shape[0] == batch_size
src_length = src_tokens.shape[1]
# The final output distribution will be a mixture of the normal output
# distribution (softmax of logits) and attention weights.
gen_dists = self.get_normalized_probs_scriptable(
(logits, None), log_probs=False, sample=None
)
gen_dists = torch.mul(gen_dists, p_gens)
padding_size = (batch_size, output_length, self.num_oov_types)
padding = gen_dists.new_zeros(padding_size)
gen_dists = torch.cat((gen_dists, padding), 2)
assert gen_dists.shape[2] == self.num_types
# Scatter attention distributions to distributions over the extended
# vocabulary in a tensor of shape [batch_size, output_length,
# vocab_size]. Each attention weight will be written into a location
# that is for other dimensions the same as in the index tensor, but for
# the third dimension it's the value of the index tensor (the token ID).
attn = torch.mul(attn.float(), 1 - p_gens)
index = src_tokens[:, None, :]
index = index.expand(batch_size, output_length, src_length)
attn_dists_size = (batch_size, output_length, self.num_types)
attn_dists = attn.new_zeros(attn_dists_size)
attn_dists.scatter_add_(2, index, attn.float())
# Final distributions, [batch_size, output_length, num_types].
return gen_dists + attn_dists
def get_normalized_probs(
self,
net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
log_probs: bool,
sample: Optional[Dict[str, Tensor]] = None,
):
"""
Get normalized probabilities (or log probs) from a net's output.
Pointer-generator network output is already normalized.
"""
probs = net_output[0]
# Make sure the probabilities are greater than zero when returning log
# probabilities.
return probs.clamp(1e-10, 1.0).log() if log_probs else probs
class Embedding(nn.Embedding):
r"""A simple lookup table that stores embeddings of a fixed dictionary and size.
This module is often used to store word embeddings and retrieve them using indices.
The input to the module is a list of indices, and the output is the corresponding
word embeddings. This subclass differs from the standard PyTorch Embedding class by
allowing additional vocabulary entries that will be mapped to the unknown token
embedding.
Args:
num_embeddings (int): size of the dictionary of embeddings
embedding_dim (int): the size of each embedding vector
padding_idx (int): Pads the output with the embedding vector at :attr:`padding_idx`
(initialized to zeros) whenever it encounters the index.
unk_idx (int): Maps all token indices that are greater than or equal to
num_embeddings to this index.
Attributes:
weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim)
initialized from :math:`\mathcal{N}(0, 1)`
Shape:
- Input: :math:`(*)`, LongTensor of arbitrary shape containing the indices to extract
- Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}`
.. note::
Keep in mind that only a limited number of optimizers support
sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`),
:class:`optim.SparseAdam` (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`)
.. note::
With :attr:`padding_idx` set, the embedding vector at
:attr:`padding_idx` is initialized to all zeros. However, note that this
vector can be modified afterwards, e.g., using a customized
initialization method, and thus changing the vector used to pad the
output. The gradient for this vector from :class:`~torch.nn.Embedding`
is always zero.
"""
__constants__ = ["unk_idx"]
# Torchscript: Inheriting from Embedding class produces an error when exporting to Torchscript
# -> RuntimeError: Unable to cast Python instance to C++ type (compile in debug mode for details
# It's happening because max_norm attribute from nn.Embedding is None by default and it cannot be
# cast to a C++ type
def __init__(
self,
num_embeddings: int,
embedding_dim: int,
padding_idx: Optional[int],
unk_idx: int,
max_norm: Optional[float] = float("inf"),
):
super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx, max_norm=max_norm)
self.unk_idx = unk_idx
nn.init.normal_(self.weight, mean=0, std=embedding_dim ** -0.5)
nn.init.constant_(self.weight[padding_idx], 0)
def forward(self, input):
input = torch.where(
input >= self.num_embeddings, torch.ones_like(input) * self.unk_idx, input
)
return nn.functional.embedding(
input, self.weight, self.padding_idx, self.max_norm,
self.norm_type, self.scale_grad_by_freq, self.sparse
)
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator"
)
def transformer_pointer_generator(args):
args.alignment_heads = getattr(args, "alignment_heads", 1)
args.alignment_layer = getattr(args, "alignment_layer", -1)
base_architecture(args)
if args.alignment_layer < 0:
args.alignment_layer = args.decoder_layers + args.alignment_layer
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator_iwslt_de_en"
)
def transformer_pointer_generator_iwslt_de_en(args):
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024)
args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
args.encoder_layers = getattr(args, "encoder_layers", 6)
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 1024)
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)
args.decoder_layers = getattr(args, "decoder_layers", 6)
transformer_pointer_generator(args)
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de"
)
def transformer_pointer_generator_wmt_en_de(args):
transformer_pointer_generator(args)
# Transformer pointer-generator with the base Transformer parameters as used in
# the "Attention Is All You Need" paper (Vaswani et al., 2017)
@register_model_architecture(
"transformer_pointer_generator",
"transformer_pointer_generator_vaswani_wmt_en_de_big",
)
def transformer_pointer_generator_vaswani_wmt_en_de_big(args):
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024)
args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
args.dropout = getattr(args, "dropout", 0.3)
transformer_pointer_generator(args)
@register_model_architecture(
"transformer_pointer_generator",
"transformer_pointer_generator_vaswani_wmt_en_fr_big",
)
def transformer_pointer_generator_vaswani_wmt_en_fr_big(args):
args.dropout = getattr(args, "dropout", 0.1)
transformer_pointer_generator_vaswani_wmt_en_de_big(args)
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de_big"
)
def transformer_pointer_generator_wmt_en_de_big(args):
args.attention_dropout = getattr(args, "attention_dropout", 0.1)
transformer_pointer_generator_vaswani_wmt_en_de_big(args)
# default parameters used in tensor2tensor implementation
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de_big_t2t"
)
def transformer_pointer_generator_wmt_en_de_big_t2t(args):
args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)
args.attention_dropout = getattr(args, "attention_dropout", 0.1)
args.activation_dropout = getattr(args, "activation_dropout", 0.1)
transformer_pointer_generator_vaswani_wmt_en_de_big(args)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import re
import sys
class OOVIndexError(IndexError):
def __init__(self, pos, source_seq, target_seq):
super(OOVIndexError, self).__init__(
"A <unk-N> tag in the target sequence refers to a position that is "
"outside the source sequence. Most likely there was a mismatch in "
"provided source and target sequences. Otherwise this would mean that "
"the pointing mechanism somehow attended to a position that is past "
"the actual sequence end."
)
self.source_pos = pos
self.source_seq = source_seq
self.target_seq = target_seq
def replace_oovs(source_in, target_in, target_out):
"""Replaces <unk-N> tokens in the target text with the corresponding word in
the source text.
"""
oov_re = re.compile("^<unk-([0-9]+)>$")
for source_seq, target_seq in zip(source_in, target_in):
target_seq_out = []
pos_to_word = source_seq.strip().split()
for token in target_seq.strip().split():
m = oov_re.match(token)
if m:
pos = int(m.group(1))
if pos >= len(pos_to_word):
raise OOVIndexError(pos, source_seq, target_seq)
token_out = pos_to_word[pos]
else:
token_out = token
target_seq_out.append(token_out)
target_out.write(" ".join(target_seq_out) + "\n")
def main():
parser = argparse.ArgumentParser(
description="Replaces <unk-N> tokens in target sequences with words from "
"the corresponding position in the source sequence."
)
parser.add_argument(
"--source", type=str, help="text file with source sequences", required=True
)
parser.add_argument(
"--target", type=str, help="text file with target sequences", required=True
)
parser.add_argument(
"--target-out",
type=str,
help="where to write target sequences without <unk-N> " "entries",
required=True,
)
args = parser.parse_args()
with open(args.source, "r", encoding="utf-8") as source_in, open(
args.target, "r", encoding="utf-8"
) as target_in, open(args.target_out, "w", encoding="utf-8") as target_out:
replace_oovs(source_in, target_in, target_out)
if __name__ == "__main__":
try:
main()
except OOVIndexError as e:
print(e, file=sys.stderr)
print("Source sequence:", e.source_seq.strip(), file=sys.stderr)
print("Target sequence:", e.target_seq.strip(), file=sys.stderr)
print(
"Source sequence length:",
len(e.source_seq.strip().split()),
file=sys.stderr,
)
print("The offending tag points to:", e.source_pos)
sys.exit(2)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
from itertools import zip_longest
def replace_oovs(source_in, target_in, vocabulary, source_out, target_out):
"""Replaces out-of-vocabulary words in source and target text with <unk-N>,
    where N is the position of the word in the source sequence.
"""
def format_unk(pos):
return "<unk-{}>".format(pos)
if target_in is None:
target_in = []
for seq_num, (source_seq, target_seq) in enumerate(
zip_longest(source_in, target_in)
):
source_seq_out = []
target_seq_out = []
word_to_pos = dict()
for position, token in enumerate(source_seq.strip().split()):
if token in vocabulary:
token_out = token
else:
if token in word_to_pos:
oov_pos = word_to_pos[token]
else:
word_to_pos[token] = position
oov_pos = position
token_out = format_unk(oov_pos)
source_seq_out.append(token_out)
source_out.write(" ".join(source_seq_out) + "\n")
if target_seq is not None:
for token in target_seq.strip().split():
if token in word_to_pos:
token_out = format_unk(word_to_pos[token])
else:
token_out = token
target_seq_out.append(token_out)
if target_out is not None:
target_out.write(" ".join(target_seq_out) + "\n")
def main():
parser = argparse.ArgumentParser(
description="Replaces out-of-vocabulary words in both source and target "
"sequences with tokens that indicate the position of the word "
"in the source sequence."
)
parser.add_argument(
"--source", type=str, help="text file with source sequences", required=True
)
parser.add_argument(
"--target", type=str, help="text file with target sequences", default=None
)
parser.add_argument("--vocab", type=str, help="vocabulary file", required=True)
parser.add_argument(
"--source-out",
type=str,
help="where to write source sequences with <unk-N> entries",
required=True,
)
parser.add_argument(
"--target-out",
type=str,
help="where to write target sequences with <unk-N> entries",
default=None,
)
args = parser.parse_args()
with open(args.vocab, encoding="utf-8") as vocab:
vocabulary = vocab.read().splitlines()
target_in = (
open(args.target, "r", encoding="utf-8") if args.target is not None else None
)
target_out = (
open(args.target_out, "w", encoding="utf-8")
if args.target_out is not None
else None
)
with open(args.source, "r", encoding="utf-8") as source_in, open(
args.source_out, "w", encoding="utf-8"
) as source_out:
replace_oovs(source_in, target_in, vocabulary, source_out, target_out)
if target_in is not None:
target_in.close()
if target_out is not None:
target_out.close()
if __name__ == "__main__":
main()
# Training with Quantization Noise for Extreme Model Compression ({Fan\*, Stock\*} *et al.*, 2020)
This page contains information on how to train and quantize models with Quantization Noise, for both scalar quantization like `int8` and Iterative Product Quantization.
Check out our paper [here](https://arxiv.org/abs/2004.07320).
Looking for pretrained models? They will be added shortly.
Looking for code to train vision models? We are working on open sourcing our code as part of ClassyVision. Please check back, but note that both the Scalar and Iterative Product Quantization counterparts of the `nn.Conv2d` module are already included in this release.
**Contents**:
- [Walk through of code](#walk-through-the-code)
- [Reproduce NLP Results](#looking-to-reproduce-the-nlp-results-in-the-paper)
- [Reproduce Vision Results](#looking-to-reproduce-the-vision-results-in-the-paper)
## Citation
```bibtex
@article{fan2020training,
title={Training with Quantization Noise for Extreme Model Compression},
author={Angela Fan* and Pierre Stock* and Benjamin Graham and Edouard Grave and Remi Gribonval and Herve Jegou and Armand Joulin},
year={2020},
eprint={2004.07320},
archivePrefix={arXiv},
primaryClass={cs.ML}
}
```
## Walk through the code
Training a model with Quant-Noise makes it robust to quantization, which improves performance when the model is later quantized for inference. The technique is useful for both scalar and product quantization methods, and across multiple domains. Below we detail how to train and quantize models, and how to integrate our code to quantize your own models.
### Scalar Quantization
Unlike the [Iterative Product Quantization](#iterative-product-quantization) section, which gives state-of-the-art compression, this section showcases the usefulness of our approach for simple scalar quantization baselines such as `int8`, using on-GPU Fake Quantization.
#### Training
Scalar quantization with Quant-Noise consists of randomly quantizing a proportion `p` of the weights during training. Scalar quantization is implemented [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quantization/scalar) in the form of Fake Quantization, meaning that we emulate int8 on GPU by quantizing and de-quantizing both the weights and the activations. We rely on PyTorch's [quantization primitives](https://github.com/pytorch/pytorch/tree/master/torch/quantization).
To train a model with Quant-Noise, add the following flag:
```
--quant-noise-scalar 0.5
```
Large values of noise make the network easier to quantize but may result in higher non-quantized test and validation perplexities.
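To make the idea concrete, the sketch below shows roughly what fake `int8` quantization with Quant-Noise does to a weight tensor during a training step. It is illustrative only; the actual implementation lives in the linked `quantization/scalar` module and also quantizes the activations:
```python
import torch

def fake_int8_quant_noise(weight, p=0.5):
    """Quantize-dequantize a random proportion p of the weights to emulate int8."""
    scale = weight.abs().max() / 127.0
    dequantized = torch.round(weight / scale).clamp(-128, 127) * scale
    noise_mask = torch.rand_like(weight) < p  # which weights see quantization noise
    return torch.where(noise_mask, dequantized, weight)

weights = torch.randn(4, 8)
print(fake_int8_quant_noise(weights, p=0.5))
```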
#### Quantization
When evaluating a network, all quantized modules and activation hooks automatically switch to `p=1`, so the validation accuracy reported by Fairseq is already the quantized accuracy; there is nothing more to do.
#### Integration with your own code
Looking to quantize your own models with Quant-Noise + Scalar Quantization?
- Use the function `quantize_model_` implemented [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quantization/scalar/utils.py) to (1) replace all your modules by their quantized counterparts and (2) add hooks to those modules to quantize the activations (see the sketch after this list).
- Then, perform your training as usual. Note that in `eval()` mode, the network is always fully quantized (weights and activations) by default (`p=1`).
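A minimal sketch of that integration follows. The import path and the keyword arguments (`p`, `bits`) are assumptions here; check the linked `utils.py` for the exact function signature in your fairseq version:
```python
import torch.nn as nn
from fairseq.modules.quantization.scalar import quantize_model_

model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))

# Replace supported modules with fake-quantized counterparts and register
# activation-quantization hooks in place. Keyword names are assumptions; see
# fairseq/modules/quantization/scalar/utils.py for the exact signature.
quantize_model_(model, p=0.5, bits=8)

# ...then train as usual; in eval() mode the network behaves fully quantized.
```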
### Iterative Product Quantization
Iterative Product Quantization with Quant-Noise proceeds in two steps. First, a model must be trained uncompressed with Quant-Noise. Second, the model must be quantized with iPQ. Note that we implement here the simplest form of noise, which consists of randomly dropping a proportion `p` of blocks; this worked as well as assigning those blocks to their current centroid.
#### Training
To train a model with Quant-Noise, add the following flags:
```
--quant-noise-pq 0.1 --quant-noise-pq-block-size 8
```
`quant-noise-pq` controls how much dropout is applied to the blocks of the weight matrix. `quant-noise-pq-block-size` controls the size of the weight matrix blocks.
We recommend training with 0.05 to 0.2 Quant-Noise, a range that worked well in our experiments. For the block size, we recommend training with a block size of 8. Note that the number of input features must be a multiple of the block size (see the size checks [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quant_noise.py)). Large block sizes result in a higher compression ratio but may induce a loss in accuracy.
We currently support training Transformer based models, such as sequence-to-sequence, language models, and BERT architectures. The `quant_noise` function [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quant_noise.py) wraps a module. It splits a weight matrix into blocks and applies random dropout to these blocks.
In the Transformer architectures, quant-noise is applied to the input and output embeddings, the attention, and the FFN.
Quant-Noise can also be combined with **LayerDrop** (see [here](https://github.com/pytorch/fairseq/tree/master/examples/layerdrop)) to add its pruning effect to the quantized model and make the model even smaller. We recommend training with LayerDrop 0.1 or 0.2.
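For reference, wrapping a single layer with the `quant_noise` helper mentioned above looks like this (`p` is the proportion of blocks dropped and `block_size` the block size; the number of input features, 512 here, must be a multiple of the block size):
```python
import torch.nn as nn
from fairseq.modules.quant_noise import quant_noise

# During training, a proportion p of 8-element weight blocks is dropped
# (quantization noise); at inference the layer behaves like a normal nn.Linear.
layer = quant_noise(nn.Linear(512, 512), p=0.1, block_size=8)
```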
#### Quantization
We implement an improved version of product quantization from Stock et al., **iPQ**, described [here](https://arxiv.org/abs/1907.05686); code using the old API is available [here](https://github.com/facebookresearch/kill-the-bits). Note that we improved the iPQ API in terms of both compute speed and usability, as described below.
For the particular case of PQ, quantization is performed sequentially. We recommend first quantizing the FFNs, then the EMBs, and finally the ATTNs. Quantization is done in two sub-steps:
- First, perform `n` steps of Product Quantization (generally `n=20` is enough).
- Then, finetune the obtained centroids.
#### Integration with your own code
Looking to quantize your own models with Quant-Noise + iPQ?
- First, wrap your modules with the `quant_noise` function [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quant_noise.py), which is module-agnostic, and train your favorite model.
- Then, quantize your trained model using the code [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quantization/pq). This can be done *without any changes to your training loop*. Below is an example code for integration.
Note that we tried our approach only on Transformers and various Convolutional Models such as EfficientNets.
```python
from fairseq.modules.quantization.pq import quantize_model_, SizeTracker
# get configuration parameters
n_centroids_config = config["n_centroids"]
block_sizes_config = config["block_sizes"]
layers_to_quantize = config["layers_to_quantize"]
# size tracker for keeping track of assignments, centroids and non-compressed sizes
size_tracker = SizeTracker(model)
# Quantize model by stages
for step in range(len(layers_to_quantize)):
# quantize model in-place
quantized_layers = quantize_model_(
model,
size_tracker,
layers_to_quantize,
block_sizes_config,
n_centroids_config,
step=step,
)
logger.info(f"Finetuning stage {step}, quantized layers: {quantized_layers}")
logger.info(f"{size_tracker}")
# Don't forget to re-create/update trainer/optimizer since model parameters have changed
optimizer = ...
# Finetune the centroids with your usual training loop for a few epochs
trainer.train_epoch()
```
## Looking to reproduce the NLP results in the paper?
We detail below how to reproduce the state-of-the-art results reported in the paper for Quant-Noise + Iterative Product Quantization.
### Training with Quant-Noise
To **train** RoBERTa + QuantNoise, we followed this setting [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta).
The following command can be used to train a RoBERTa Base + QuantNoise model:
```bash
TOTAL_UPDATES=125000
WARMUP_UPDATES=10000
PEAK_LR=0.0005
TOKENS_PER_SAMPLE=512
MAX_POSITIONS=512
MAX_SENTENCES=16
UPDATE_FREQ=2
DATA_DIR=/path/to/data/here
fairseq-train $DATA_DIR \
--task masked_lm --criterion masked_lm --arch roberta_base \
--sample-break-mode complete \
--tokens-per-sample $TOKENS_PER_SAMPLE --max-positions $MAX_POSITIONS \
--optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 \
--clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $PEAK_LR \
--warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.01 \
--batch-size $MAX_SENTENCES \
--update-freq $UPDATE_FREQ --max-update $TOTAL_UPDATES \
--save-dir checkpoint/roberta \
--ddp-backend legacy_ddp --encoder-layerdrop 0.2 \
--quant-noise-pq 0.2 --quant-noise-pq-block-size 8 --untie-weights-roberta
```
To **finetune** RoBERTa + QuantNoise, we followed the GLUE finetuning setup described [here](https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.glue.md).
The following command can be used to finetune a RoBERTa Base + QuantNoise model on the RTE dataset:
```bash
TOTAL_NUM_UPDATES=2036
WARMUP_UPDATES=122
LR=2e-05
NUM_CLASSES=2
MAX_SENTENCES=16
ROBERTA_PATH=/path/to/roberta_quantnoise/model.pt
fairseq-train /path/to/rte/data/ \
--restore-file $ROBERTA_PATH \
--max-positions 512 \
--batch-size $MAX_SENTENCES \
--max-tokens 4400 \
--task sentence_prediction \
--reset-optimizer --reset-dataloader --reset-meters \
--required-batch-size-multiple 1 \
--init-token 0 --separator-token 2 \
--arch roberta_large \
--criterion sentence_prediction \
--num-classes $NUM_CLASSES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
--clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--max-epoch 10 \
--find-unused-parameters \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--ddp-backend legacy_ddp \
--quant-noise-pq 0.2 --quant-noise-pq-block-size 8
```
To **train** language models on Wikitext-103, we followed the language modeling setup described [here](https://github.com/pytorch/fairseq/tree/master/examples/language_model).
The following command can be used to train a Transformer + QuantNoise model on Wikitext-103:
```bash
fairseq-train --task language_modeling /path/to/wikitext-103/data \
--save-dir checkpoints/transformer_wikitext-103 \
--adaptive-input --adaptive-input-cutoff 20000,60000 --adaptive-input-factor 4 \
--adaptive-softmax-cutoff 20000,60000 --adaptive-softmax-dropout 0.2 --adaptive-softmax-factor 4.0 \
--tie-adaptive-proj --tie-adaptive-weights \
--arch transformer_lm_gbw \
--attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \
--clip-norm 0.1 --criterion adaptive_loss \
--ddp-backend legacy_ddp \
--decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 \
--decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \
--min-lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --lr 1.0 --t-mult 2.0 \
--max-tokens 3072 --tokens-per-sample 3072 --momentum 0.99 --optimizer nag \
--sample-break-mode none --update-freq 3 \
--warmup-init-lr 1e-07 --warmup-updates 16000 \
--weight-decay 0 --seed 1 --stop-min-lr 1e-09 \
--quant-noise-pq 0.05 --quant-noise-pq-block-size 8
```
To **evaluate** this model, use `fairseq-eval-lm` (rather than `fairseq-train`). The following command can be used to evaluate:
```bash
fairseq-eval-lm /path/to/wikitext-103/data --path /path/to/model/checkpoint \
--sample-break-mode complete \
--max-tokens 3072 \
--context-window 2560 \
--softmax-batch 1024 \
--gen-subset valid
```
and change the `--gen-subset` to `test` if you would like to evaluate on the test set instead.
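If you would rather probe the trained language model from Python, a rough sketch along the lines of fairseq's hub interface is given below; the checkpoint directory and data path are placeholders and may need adjusting for your setup.
```python
from fairseq.models.transformer_lm import TransformerLanguageModel

# Load the trained LM together with the binarized Wikitext-103 dictionary;
# both paths below are placeholders.
lm = TransformerLanguageModel.from_pretrained(
    '/path/to/checkpoints',
    checkpoint_file='checkpoint_best.pt',
    data_name_or_path='/path/to/wikitext-103/data',
)
lm.eval()  # disable dropout

# Sample a short continuation (Wikitext-103 here is word-level, so no BPE is needed).
print(lm.sample('The book was', beam=1, sampling=True, sampling_topk=10))
```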
### Iterative Product Quantization
To quantize the finetuned RoBERTa model, we use this command on 1 GPU. This should run in a day.
```bash
TOTAL_NUM_UPDATES=6108 # 2036 updates for each iteration
WARMUP_UPDATES=122
LR=2e-05
NUM_CLASSES=2
MAX_SENTENCES=16
fairseq-train --task sentence_prediction /path/to/data/ \
--restore-file $ROBERTA_PATH \
--save-dir checkpoints/roberta_finetuned \
--max-positions 512 \
--batch-size $MAX_SENTENCES \
--max-tokens 4400 \
--init-token 0 --separator-token 2 \
--arch roberta_large \
--criterion sentence_prediction \
--num-classes $NUM_CLASSES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
--clip-norm 0.0 --lr-scheduler polynomial_decay \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--no-progress-bar --skip-invalid-size-inputs-valid-test --ddp-backend legacy_ddp \
--quantization-config-path /path/to/config/yaml
```
To quantize the trained Language Model, we use this command on 8 V100 23GB GPUs. This should run in a couple of hours.
```bash
fairseq-train --task language_modeling /path/to/wikitext-103/data \
--save-dir checkpoints/transformer_wikitext-103 \
--adaptive-input --adaptive-input-cutoff 20000,60000 --adaptive-input-factor 4 \
--adaptive-softmax-cutoff 20000,60000 --adaptive-softmax-dropout 0.2 --adaptive-softmax-factor 4.0 \
--arch transformer_lm_gbw \
--attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \
--bucket-cap-mb 25 --char-embedder-highway-layers 2 --character-embedding-dim 4 \
--clip-norm 0.1 --criterion adaptive_loss \
--ddp-backend legacy_ddp \
--decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 --decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \
--fp16 --keep-last-epochs -1 \
--min-lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --lr 0.05 --stop-min-lr 1e-09 \
--max-tokens 2944 --tokens-per-sample 2944\
--momentum 0.99 --no-epoch-checkpoints --no-progress-bar --optimizer nag --required-batch-size-multiple 8 \
--sample-break-mode none --t-mult 2.0 --skip-invalid-size-inputs-valid-test \
--tie-adaptive-proj --tie-adaptive-weights --update-freq 3 --weight-decay 0 --seed 1 \
--log-interval 100 --no-progress-bar --skip-invalid-size-inputs-valid-test \
--restore-file path/to/trained/lm/with/quant/noise \
--max-update 13500 --quantization-config-path /path/to/config/yaml
```
If you have less GPU capacity or if your distributed training freezes, try reducing `--max-tokens` and `--tokens-per-sample` (this may reduce the accuracy of the quantized model a bit).
### Remarks
We try to keep the open-sourced code as readable and as easy to plug in as possible. Therefore, we did not test it for the following cases:
- Scalar quantization with RoBERTa.
- Quantization with iPQ and `int8` combined.
If you have trouble adapting it, we will be more than happy to help!
## Looking to reproduce the Vision results in the paper?
We are working on open-sourcing our code as part of ClassyVision. Please check back.
## Having an issue or have a question?
Please open an issue in this repository with the details of your question. Thanks!
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# This file defines example configuration arguments for quantizing
# a transformer model with product quantization
# Number of Centroids for Product Quantization, by default 256 (byte-aligned)
n_centroids:
Linear:
key: in_features
value: {"*": 256}
Embedding:
key: embedding_dim
value: {"*": 256}
# Block Sizes for Product Quantization
# We suggest: 8 for FFN, 4 for ATTN, 4 for embedding projections, 8 for embeddings
block_sizes:
Linear:
key: fuzzy_name
value: {fc: 8, attn: 4, emb: 4}
Embedding:
key: fuzzy_name
value: {emb: 8}
# Layers to Quantize Sequentially
# We suggest: first FFN, then EMB, then ATTN
layers_to_quantize:
- decoder\\.layers\\.\d+\\.fc[12]
- decoder\\.embed_tokens\\.embeddings\\.[012]\\.[01]
- decoder\\.layers\\.\d+\\.self_attn\\.(k_proj|v_proj|q_proj|out_proj)
# Finetuning RoBERTa on a custom classification task
This example shows how to finetune RoBERTa on the IMDB dataset, but should illustrate the process for most classification tasks.
### 1) Get the data
```bash
wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar zxvf aclImdb_v1.tar.gz
```
### 2) Format data
`IMDB` data has one sample per file; the Python snippet below gathers the samples into a single input file and a single label file for each of the train and dev splits, for ease of processing.
```python
import argparse
import os
import random
from glob import glob
random.seed(0)
def main(args):
    for split in ['train', 'test']:
        samples = []
        for class_label in ['pos', 'neg']:
            fnames = glob(os.path.join(args.datadir, split, class_label) + '/*.txt')
            for fname in fnames:
                with open(fname) as fin:
                    # each IMDB file contains a single review on one line
                    line = fin.readline()
                    samples.append((line, 1 if class_label == 'pos' else 0))
        random.shuffle(samples)
        out_fname = 'train' if split == 'train' else 'dev'
        # write one review per line and the matching 0/1 label on the same line number
        f1 = open(os.path.join(args.datadir, out_fname + '.input0'), 'w')
        f2 = open(os.path.join(args.datadir, out_fname + '.label'), 'w')
        for sample in samples:
            f1.write(sample[0] + '\n')
            f2.write(str(sample[1]) + '\n')
        f1.close()
        f2.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--datadir', default='aclImdb')
args = parser.parse_args()
main(args)
```
### 3) BPE encode
Run `multiprocessing_bpe_encoder`; you could also BPE-encode each sample in the previous step, but that would be slower.
```bash
# Download encoder.json and vocab.bpe
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
for SPLIT in train dev; do
python -m examples.roberta.multiprocessing_bpe_encoder \
--encoder-json encoder.json \
--vocab-bpe vocab.bpe \
--inputs "aclImdb/$SPLIT.input0" \
--outputs "aclImdb/$SPLIT.input0.bpe" \
--workers 60 \
--keep-empty
done
```
### 4) Preprocess data
```bash
# Download fairseq dictionary.
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
fairseq-preprocess \
--only-source \
--trainpref "aclImdb/train.input0.bpe" \
--validpref "aclImdb/dev.input0.bpe" \
--destdir "IMDB-bin/input0" \
--workers 60 \
--srcdict dict.txt
fairseq-preprocess \
--only-source \
--trainpref "aclImdb/train.label" \
--validpref "aclImdb/dev.label" \
--destdir "IMDB-bin/label" \
--workers 60
```
### 5) Run training
```bash
TOTAL_NUM_UPDATES=7812 # 10 epochs through IMDB for bsz 32
WARMUP_UPDATES=469 # 6 percent of the number of updates
LR=1e-05 # Peak LR for polynomial LR scheduler.
HEAD_NAME=imdb_head # Custom name for the classification head.
NUM_CLASSES=2 # Number of classes for the classification task.
MAX_SENTENCES=8 # Batch size.
ROBERTA_PATH=/path/to/roberta.large/model.pt
CUDA_VISIBLE_DEVICES=0 fairseq-train IMDB-bin/ \
--restore-file $ROBERTA_PATH \
--max-positions 512 \
--batch-size $MAX_SENTENCES \
--max-tokens 4400 \
--task sentence_prediction \
--reset-optimizer --reset-dataloader --reset-meters \
--required-batch-size-multiple 1 \
--init-token 0 --separator-token 2 \
--arch roberta_large \
--criterion sentence_prediction \
--classification-head-name $HEAD_NAME \
--num-classes $NUM_CLASSES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
--clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--max-epoch 10 \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--shorten-method "truncate" \
--find-unused-parameters \
--update-freq 4
```
The above command will finetune RoBERTa-large with an effective batch-size of 32
sentences (`--batch-size=8 --update-freq=4`). The expected
`best-validation-accuracy` after 10 epochs is ~96.5%.
If you run out of GPU memory, try decreasing `--batch-size` and increasing
`--update-freq` to compensate.
### 6) Load model using hub interface
Now we can load the trained model checkpoint using the RoBERTa hub interface.
Assuming your checkpoints are stored in `checkpoints/`:
```python
from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained(
'checkpoints',
checkpoint_file='checkpoint_best.pt',
data_name_or_path='IMDB-bin'
)
roberta.eval() # disable dropout
```
Finally you can make predictions using the `imdb_head` (or whatever you set
`--classification-head-name` to during training):
```python
label_fn = lambda label: roberta.task.label_dictionary.string(
[label + roberta.task.label_dictionary.nspecial]
)
tokens = roberta.encode('Best movie this year')
pred = label_fn(roberta.predict('imdb_head', tokens).argmax().item())
assert pred == '1' # positive
tokens = roberta.encode('Worst movie ever')
pred = label_fn(roberta.predict('imdb_head', tokens).argmax().item())
assert pred == '0' # negative
```
# Finetuning RoBERTa on GLUE tasks
### 1) Download the data from the GLUE website (https://gluebenchmark.com/tasks) using the following commands:
```bash
wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py
python download_glue_data.py --data_dir glue_data --tasks all
```
### 2) Preprocess GLUE task data:
```bash
./examples/roberta/preprocess_GLUE_tasks.sh glue_data <glue_task_name>
```
`glue_task_name` is one of the following:
`{ALL, QQP, MNLI, QNLI, MRPC, RTE, STS-B, SST-2, CoLA}`
Use `ALL` to preprocess all the GLUE tasks.
### 3) Fine-tuning on GLUE task:
Example fine-tuning command for the `RTE` task:
```bash
TOTAL_NUM_UPDATES=2036 # 10 epochs through RTE for bsz 16
WARMUP_UPDATES=122 # 6 percent of the number of updates
LR=2e-05 # Peak LR for polynomial LR scheduler.
NUM_CLASSES=2
MAX_SENTENCES=16 # Batch size.
ROBERTA_PATH=/path/to/roberta/model.pt
CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin/ \
--restore-file $ROBERTA_PATH \
--max-positions 512 \
--batch-size $MAX_SENTENCES \
--max-tokens 4400 \
--task sentence_prediction \
--reset-optimizer --reset-dataloader --reset-meters \
--required-batch-size-multiple 1 \
--init-token 0 --separator-token 2 \
--arch roberta_large \
--criterion sentence_prediction \
--num-classes $NUM_CLASSES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
--clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--max-epoch 10 \
--find-unused-parameters \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric;
```
For each GLUE task, you will need to use the following command-line arguments:
Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B
---|---|---|---|---|---|---|---|---
`--num-classes` | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 1
`--lr` | 1e-5 | 1e-5 | 1e-5 | 2e-5 | 1e-5 | 1e-5 | 1e-5 | 2e-5
`--batch-size` | 32 | 32 | 32 | 16 | 32 | 16 | 16 | 16
`--total-num-update` | 123873 | 33112 | 113272 | 2036 | 20935 | 2296 | 5336 | 3598
`--warmup-updates` | 7432 | 1986 | 28318 | 122 | 1256 | 137 | 320 | 214
For `STS-B` additionally add `--regression-target --best-checkpoint-metric loss` and remove `--maximize-best-checkpoint-metric`.
**Note:**
a) `--total-num-update` is used by the `polynomial_decay` scheduler and is calculated for `--max-epoch=10` and `--batch-size=16/32` depending on the task.
b) The above command-line arguments and hyperparameters were tested on a single Nvidia `V100` GPU with `32GB` of memory for each task. Depending on the GPU memory available to you, you can increase `--update-freq` and reduce `--batch-size`.
c) All the settings in the above table are suggested settings based on our hyperparameter search within a fixed search space (for careful comparison across models). You might be able to find better metrics with a wider hyperparameter search.
### Inference on GLUE task
After training the model as described in the previous step, you can perform inference with the checkpoints in the `checkpoints/` directory using the following Python code snippet:
```python
from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained(
'checkpoints/',
checkpoint_file='checkpoint_best.pt',
data_name_or_path='RTE-bin'
)
label_fn = lambda label: roberta.task.label_dictionary.string(
[label + roberta.task.label_dictionary.nspecial]
)
ncorrect, nsamples = 0, 0
roberta.cuda()
roberta.eval()
with open('glue_data/RTE/dev.tsv') as fin:
fin.readline()
for index, line in enumerate(fin):
tokens = line.strip().split('\t')
sent1, sent2, target = tokens[1], tokens[2], tokens[3]
tokens = roberta.encode(sent1, sent2)
prediction = roberta.predict('sentence_classification_head', tokens).argmax().item()
prediction_label = label_fn(prediction)
ncorrect += int(prediction_label == target)
nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
```
# RoBERTa: A Robustly Optimized BERT Pretraining Approach
https://arxiv.org/abs/1907.11692
## Introduction
RoBERTa iterates on BERT's pretraining procedure, including training the model longer, with bigger batches over more data; removing the next sentence prediction objective; training on longer sequences; and dynamically changing the masking pattern applied to the training data. See the associated paper for more details.
### What's New:
- December 2020: German model (GottBERT) is available: [GottBERT](https://github.com/pytorch/fairseq/tree/master/examples/gottbert).
- January 2020: Italian model (UmBERTo) is available from Musixmatch Research: [UmBERTo](https://github.com/musixmatchresearch/umberto).
- November 2019: French model (CamemBERT) is available: [CamemBERT](https://github.com/pytorch/fairseq/tree/master/examples/camembert).
- November 2019: Multilingual encoder (XLM-RoBERTa) is available: [XLM-R](https://github.com/pytorch/fairseq/tree/master/examples/xlmr).
- September 2019: TensorFlow and TPU support via the [transformers library](https://github.com/huggingface/transformers).
- August 2019: RoBERTa is now supported in the [pytorch-transformers library](https://github.com/huggingface/pytorch-transformers).
- August 2019: Added [tutorial for finetuning on WinoGrande](https://github.com/pytorch/fairseq/tree/master/examples/roberta/wsc#roberta-training-on-winogrande-dataset).
- August 2019: Added [tutorial for pretraining RoBERTa using your own data](README.pretraining.md).
## Pre-trained models
Model | Description | # params | Download
---|---|---|---
`roberta.base` | RoBERTa using the BERT-base architecture | 125M | [roberta.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz)
`roberta.large` | RoBERTa using the BERT-large architecture | 355M | [roberta.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz)
`roberta.large.mnli` | `roberta.large` finetuned on [MNLI](http://www.nyu.edu/projects/bowman/multinli) | 355M | [roberta.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz)
`roberta.large.wsc` | `roberta.large` finetuned on [WSC](wsc/README.md) | 355M | [roberta.large.wsc.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz)
## Results
**[GLUE (Wang et al., 2019)](https://gluebenchmark.com/)**
_(dev set, single model, single-task finetuning)_
Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B
---|---|---|---|---|---|---|---|---
`roberta.base` | 87.6 | 92.8 | 91.9 | 78.7 | 94.8 | 90.2 | 63.6 | 91.2
`roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4
`roberta.large.mnli` | 90.2 | - | - | - | - | - | - | -
**[SuperGLUE (Wang et al., 2019)](https://super.gluebenchmark.com/)**
_(dev set, single model, single-task finetuning)_
Model | BoolQ | CB | COPA | MultiRC | RTE | WiC | WSC
---|---|---|---|---|---|---|---
`roberta.large` | 86.9 | 98.2 | 94.0 | 85.7 | 89.5 | 75.6 | -
`roberta.large.wsc` | - | - | - | - | - | - | 91.3
**[SQuAD (Rajpurkar et al., 2018)](https://rajpurkar.github.io/SQuAD-explorer/)**
_(dev set, no additional data used)_
Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1
---|---|---
`roberta.large` | 88.9/94.6 | 86.5/89.4
**[RACE (Lai et al., 2017)](http://www.qizhexie.com/data/RACE_leaderboard.html)**
_(test set)_
Model | Accuracy | Middle | High
---|---|---|---
`roberta.large` | 83.2 | 86.5 | 81.3
**[HellaSwag (Zellers et al., 2019)](https://rowanzellers.com/hellaswag/)**
_(test set)_
Model | Overall | In-domain | Zero-shot | ActivityNet | WikiHow
---|---|---|---|---|---
`roberta.large` | 85.2 | 87.3 | 83.1 | 74.6 | 90.9
**[Commonsense QA (Talmor et al., 2019)](https://www.tau-nlp.org/commonsenseqa)**
_(test set)_
Model | Accuracy
---|---
`roberta.large` (single model) | 72.1
`roberta.large` (ensemble) | 72.5
**[Winogrande (Sakaguchi et al., 2019)](https://arxiv.org/abs/1907.10641)**
_(test set)_
Model | Accuracy
---|---
`roberta.large` | 78.1
**[XNLI (Conneau et al., 2018)](https://arxiv.org/abs/1809.05053)**
_(TRANSLATE-TEST)_
Model | en | fr | es | de | el | bg | ru | tr | ar | vi | th | zh | hi | sw | ur
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
`roberta.large.mnli` | 91.3 | 82.91 | 84.27 | 81.24 | 81.74 | 83.13 | 78.28 | 76.79 | 76.64 | 74.17 | 74.05 | 77.5 | 70.9 | 66.65 | 66.81
## Example usage
##### Load RoBERTa from torch.hub (PyTorch >= 1.1):
```python
import torch
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')
roberta.eval() # disable dropout (or leave in train mode to finetune)
```
##### Load RoBERTa (for PyTorch 1.0 or custom models):
```bash
# Download roberta.large model
wget https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz
tar -xzvf roberta.large.tar.gz
```
```python
# Load the model in fairseq
from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained('/path/to/roberta.large', checkpoint_file='model.pt')
roberta.eval()  # disable dropout (or leave in train mode to finetune)
```
##### Apply Byte-Pair Encoding (BPE) to input text:
```python
tokens = roberta.encode('Hello world!')
assert tokens.tolist() == [0, 31414, 232, 328, 2]
roberta.decode(tokens) # 'Hello world!'
```
##### Extract features from RoBERTa:
```python
# Extract the last layer's features
last_layer_features = roberta.extract_features(tokens)
assert last_layer_features.size() == torch.Size([1, 5, 1024])
# Extract all layers' features (layer 0 is the embedding layer)
all_layers = roberta.extract_features(tokens, return_all_hiddens=True)
assert len(all_layers) == 25
assert torch.all(all_layers[-1] == last_layer_features)
```
##### Use RoBERTa for sentence-pair classification tasks:
```python
# Download RoBERTa already finetuned for MNLI
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
roberta.eval() # disable dropout for evaluation
# Encode a pair of sentences and make a prediction
tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.')
roberta.predict('mnli', tokens).argmax() # 0: contradiction
# Encode another pair of sentences
tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.')
roberta.predict('mnli', tokens).argmax() # 2: entailment
```
##### Register a new (randomly initialized) classification head:
```python
roberta.register_classification_head('new_task', num_classes=3)
logprobs = roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=<LogSoftmaxBackward>)
```
##### Batched prediction:
```python
import torch
from fairseq.data.data_utils import collate_tokens
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
roberta.eval()
batch_of_pairs = [
['Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.'],
['Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.'],
['potatoes are awesome.', 'I like to run.'],
['Mars is very far from earth.', 'Mars is very close.'],
]
batch = collate_tokens(
[roberta.encode(pair[0], pair[1]) for pair in batch_of_pairs], pad_idx=1
)
logprobs = roberta.predict('mnli', batch)
print(logprobs.argmax(dim=1))
# tensor([0, 2, 1, 0])
```
##### Using the GPU:
```python
roberta.cuda()
roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)
```
## Advanced usage
#### Filling masks:
RoBERTa can be used to fill `<mask>` tokens in the input. Some examples from the
[Natural Questions dataset](https://ai.google.com/research/NaturalQuestions/):
```python
roberta.fill_mask('The first Star wars movie came out in <mask>', topk=3)
# [('The first Star wars movie came out in 1977', 0.9504708051681519, ' 1977'), ('The first Star wars movie came out in 1978', 0.009986862540245056, ' 1978'), ('The first Star wars movie came out in 1979', 0.009574787691235542, ' 1979')]
roberta.fill_mask('Vikram samvat calender is official in <mask>', topk=3)
# [('Vikram samvat calender is official in India', 0.21878819167613983, ' India'), ('Vikram samvat calender is official in Delhi', 0.08547237515449524, ' Delhi'), ('Vikram samvat calender is official in Gujarat', 0.07556215673685074, ' Gujarat')]
roberta.fill_mask('<mask> is the common currency of the European Union', topk=3)
# [('Euro is the common currency of the European Union', 0.9456493854522705, 'Euro'), ('euro is the common currency of the European Union', 0.025748178362846375, 'euro'), ('€ is the common currency of the European Union', 0.011183084920048714, '€')]
```
#### Pronoun disambiguation (Winograd Schema Challenge):
RoBERTa can be used to disambiguate pronouns. First install spaCy and download the English-language model:
```bash
pip install spacy
python -m spacy download en_core_web_lg
```
Next load the `roberta.large.wsc` model and call the `disambiguate_pronoun`
function. The pronoun should be surrounded by square brackets (`[]`) and the
query referent surrounded by underscores (`_`), or left blank to return the
predicted candidate text directly:
```python
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.wsc', user_dir='examples/roberta/wsc')
roberta.cuda() # use the GPU (optional)
roberta.disambiguate_pronoun('The _trophy_ would not fit in the brown suitcase because [it] was too big.')
# True
roberta.disambiguate_pronoun('The trophy would not fit in the brown _suitcase_ because [it] was too big.')
# False
roberta.disambiguate_pronoun('The city councilmen refused the demonstrators a permit because [they] feared violence.')
# 'The city councilmen'
roberta.disambiguate_pronoun('The city councilmen refused the demonstrators a permit because [they] advocated violence.')
# 'demonstrators'
```
See the [RoBERTa Winograd Schema Challenge (WSC) README](wsc/README.md) for more details on how to train this model.
#### Extract features aligned to words:
By default RoBERTa outputs one feature vector per BPE token. You can instead
realign the features to match [spaCy's word-level tokenization](https://spacy.io/usage/linguistic-features#tokenization)
with the `extract_features_aligned_to_words` method. This will compute a
weighted average of the BPE-level features for each word and expose them in
spaCy's `Token.vector` attribute:
```python
doc = roberta.extract_features_aligned_to_words('I said, "hello RoBERTa."')
assert len(doc) == 10
for tok in doc:
print('{:10}{} (...)'.format(str(tok), tok.vector[:5]))
# <s> tensor([-0.1316, -0.0386, -0.0832, -0.0477, 0.1943], grad_fn=<SliceBackward>) (...)
# I tensor([ 0.0559, 0.1541, -0.4832, 0.0880, 0.0120], grad_fn=<SliceBackward>) (...)
# said tensor([-0.1565, -0.0069, -0.8915, 0.0501, -0.0647], grad_fn=<SliceBackward>) (...)
# , tensor([-0.1318, -0.0387, -0.0834, -0.0477, 0.1944], grad_fn=<SliceBackward>) (...)
# " tensor([-0.0486, 0.1818, -0.3946, -0.0553, 0.0981], grad_fn=<SliceBackward>) (...)
# hello tensor([ 0.0079, 0.1799, -0.6204, -0.0777, -0.0923], grad_fn=<SliceBackward>) (...)
# RoBERTa tensor([-0.2339, -0.1184, -0.7343, -0.0492, 0.5829], grad_fn=<SliceBackward>) (...)
# . tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=<SliceBackward>) (...)
# " tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=<SliceBackward>) (...)
# </s> tensor([-0.0930, -0.0392, -0.0821, 0.0158, 0.0649], grad_fn=<SliceBackward>) (...)
```
#### Evaluating the `roberta.large.mnli` model:
Example python code snippet to evaluate accuracy on the MNLI `dev_matched` set.
```python
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
ncorrect, nsamples = 0, 0
roberta.cuda()
roberta.eval()
with open('glue_data/MNLI/dev_matched.tsv') as fin:
fin.readline()
for index, line in enumerate(fin):
tokens = line.strip().split('\t')
sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
tokens = roberta.encode(sent1, sent2)
prediction = roberta.predict('mnli', tokens).argmax().item()
prediction_label = label_map[prediction]
ncorrect += int(prediction_label == target)
nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# Expected output: 0.9060
```
## Finetuning
- [Finetuning on GLUE](README.glue.md)
- [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md)
- [Finetuning on Winograd Schema Challenge (WSC)](wsc/README.md)
- [Finetuning on Commonsense QA (CQA)](commonsense_qa/README.md)
## Pretraining using your own data
See the [tutorial for pretraining RoBERTa using your own data](README.pretraining.md).
## Citation
```bibtex
@article{liu2019roberta,
title = {RoBERTa: A Robustly Optimized BERT Pretraining Approach},
author = {Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and
Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and
Luke Zettlemoyer and Veselin Stoyanov},
journal={arXiv preprint arXiv:1907.11692},
year = {2019},
}
```
# Pretraining RoBERTa using your own data
This tutorial will walk you through pretraining RoBERTa over your own data.
### 1) Preprocess the data
Data should be preprocessed following the [language modeling format](/examples/language_model), i.e. each document should be separated by an empty line (only useful with `--sample-break-mode complete_doc`). Lines will be concatenated as a 1D text stream during training.
We'll use the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/)
to demonstrate how to preprocess raw text data with the GPT-2 BPE. Of course
this dataset is quite small, so the resulting pretrained model will perform
poorly, but it gives the general idea.
First download the dataset:
```bash
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
unzip wikitext-103-raw-v1.zip
```
Next encode it with the GPT-2 BPE:
```bash
mkdir -p gpt2_bpe
wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
for SPLIT in train valid test; do \
python -m examples.roberta.multiprocessing_bpe_encoder \
--encoder-json gpt2_bpe/encoder.json \
--vocab-bpe gpt2_bpe/vocab.bpe \
--inputs wikitext-103-raw/wiki.${SPLIT}.raw \
--outputs wikitext-103-raw/wiki.${SPLIT}.bpe \
--keep-empty \
--workers 60; \
done
```
Finally preprocess/binarize the data using the GPT-2 fairseq dictionary:
```bash
wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
fairseq-preprocess \
--only-source \
--srcdict gpt2_bpe/dict.txt \
--trainpref wikitext-103-raw/wiki.train.bpe \
--validpref wikitext-103-raw/wiki.valid.bpe \
--testpref wikitext-103-raw/wiki.test.bpe \
--destdir data-bin/wikitext-103 \
--workers 60
```
### 2) Train RoBERTa base
```bash
TOTAL_UPDATES=125000 # Total number of training steps
WARMUP_UPDATES=10000 # Warmup the learning rate over this many updates
PEAK_LR=0.0005 # Peak learning rate, adjust as needed
TOKENS_PER_SAMPLE=512 # Max sequence length
MAX_POSITIONS=512 # Num. positional embeddings (usually same as above)
MAX_SENTENCES=16 # Number of sequences per batch (batch size)
UPDATE_FREQ=16 # Increase the batch size 16x
DATA_DIR=data-bin/wikitext-103
fairseq-train --fp16 $DATA_DIR \
--task masked_lm --criterion masked_lm \
--arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \
--optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $PEAK_LR --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \
--dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
--batch-size $MAX_SENTENCES --update-freq $UPDATE_FREQ \
--max-update $TOTAL_UPDATES --log-format simple --log-interval 1
```
**Note:** You can optionally resume training the released RoBERTa base model by
adding `--restore-file /path/to/roberta.base/model.pt`.
**Note:** The above command assumes training on 8x32GB V100 GPUs. Each GPU uses
a batch size of 16 sequences (`$MAX_SENTENCES`) and accumulates gradients to
further increase the batch size by 16x (`$UPDATE_FREQ`), for a total batch size
of 2048 sequences. If you have fewer GPUs or GPUs with less memory you may need
to reduce `$MAX_SENTENCES` and increase `$UPDATE_FREQ` to compensate.
Alternatively if you have more GPUs you can decrease `$UPDATE_FREQ` accordingly
to increase training speed.
**Note:** The learning rate and batch size are tightly connected and need to be
adjusted together. We generally recommend increasing the learning rate as you
increase the batch size according to the following table (although it's also
dataset dependent, so don't rely on the following values too closely):
batch size | peak learning rate
---|---
256 | 0.0001
2048 | 0.0005
8192 | 0.0007
### 3) Load your pretrained model
```python
import torch
from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'path/to/data')
assert isinstance(roberta.model, torch.nn.Module)
```
# Finetuning RoBERTa on RACE tasks
### 1) Download the data from the RACE website (http://www.cs.cmu.edu/~glai1/data/race/)
### 2) Preprocess RACE data:
```bash
python ./examples/roberta/preprocess_RACE.py --input-dir <input-dir> --output-dir <extracted-data-dir>
./examples/roberta/preprocess_RACE.sh <extracted-data-dir> <output-dir>
```
### 3) Fine-tuning on RACE:
```bash
MAX_EPOCH=5 # Number of training epochs.
LR=1e-05 # Peak LR for fixed LR scheduler.
NUM_CLASSES=4
MAX_SENTENCES=1 # Batch size per GPU.
UPDATE_FREQ=8 # Accumulate gradients to simulate training on 8 GPUs.
DATA_DIR=/path/to/race-output-dir
ROBERTA_PATH=/path/to/roberta/model.pt
CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR --ddp-backend=legacy_ddp \
--restore-file $ROBERTA_PATH \
--reset-optimizer --reset-dataloader --reset-meters \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--task sentence_ranking \
--num-classes $NUM_CLASSES \
--init-token 0 --separator-token 2 \
--max-option-length 128 \
--max-positions 512 \
--shorten-method "truncate" \
--arch roberta_large \
--dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
--criterion sentence_ranking \
--optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \
--clip-norm 0.0 \
--lr-scheduler fixed --lr $LR \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--batch-size $MAX_SENTENCES \
--required-batch-size-multiple 1 \
--update-freq $UPDATE_FREQ \
--max-epoch $MAX_EPOCH
```
**Note:**
a) As contexts in RACE are relatively long, we use a smaller batch size per GPU while increasing `--update-freq` to achieve a larger effective batch size.
b) The above command-line arguments and hyperparameters were tested on a single Nvidia `V100` GPU with `32GB` of memory for each task. Depending on the GPU memory available to you, you can increase `--update-freq` and reduce `--batch-size`.
c) The settings in the above command are based on our hyperparameter search within a fixed search space (for careful comparison across models). You might be able to find better metrics with a wider hyperparameter search.
### 4) Evaluation:
```bash
DATA_DIR=/path/to/race-output-dir # data directory used during training
MODEL_PATH=/path/to/checkpoint_best.pt # path to the finetuned model checkpoint
PREDS_OUT=preds.tsv # output file path to save prediction
TEST_SPLIT=test # can be test (Middle) or test1 (High)
fairseq-validate \
$DATA_DIR \
--valid-subset $TEST_SPLIT \
--path $MODEL_PATH \
--batch-size 1 \
--task sentence_ranking \
--criterion sentence_ranking \
--save-predictions $PREDS_OUT
```
# Finetuning RoBERTa on Commonsense QA
We follow a similar approach to [finetuning RACE](../README.race.md). Specifically
for each question we construct five inputs, one for each of the five candidate
answer choices. Each input is constructed by concatenating the question and
candidate answer. We then encode each input and pass the resulting "[CLS]"
representations through a fully-connected layer to predict the correct answer.
We train with a standard cross-entropy loss.
We also found it helpful to prepend a prefix of `Q:` to the question and `A:` to
the answer. The complete input format is:
```
<s> Q: Where would I not want a fox? </s> A: hen house </s>
```
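As a small illustration (not part of the original recipe), this layout is what RoBERTa's `encode` produces when the question and the answer are passed as two segments with `no_separator=True`, exactly as in the evaluation snippet in step 3 below:
```python
import torch

# Any RoBERTa checkpoint with the same BPE would do; we load the released
# roberta.large purely to demonstrate the encoding.
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')
roberta.eval()
tokens = roberta.encode('Q: Where would I not want a fox?', 'A: hen house', no_separator=True)
# `tokens` is a 1D tensor of token ids corresponding to:
#   <s> Q: Where would I not want a fox? </s> A: hen house </s>
```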
Our final submission is based on a hyperparameter search over the learning rate
(1e-5, 2e-5, 3e-5), batch size (8, 16), number of training steps (2000, 3000,
4000) and random seed. We selected the model with the best performance on the
development set after 100 trials.
### 1) Download data from the Commonsense QA website (https://www.tau-nlp.org/commonsenseqa)
```bash
bash examples/roberta/commonsense_qa/download_cqa_data.sh
```
### 2) Finetune
```bash
MAX_UPDATES=3000 # Number of training steps.
WARMUP_UPDATES=150 # Linearly increase LR over this many steps.
LR=1e-05 # Peak LR for polynomial LR scheduler.
MAX_SENTENCES=16 # Batch size.
SEED=1 # Random seed.
ROBERTA_PATH=/path/to/roberta/model.pt
DATA_DIR=data/CommonsenseQA
# we use the --user-dir option to load the task from
# the examples/roberta/commonsense_qa directory:
FAIRSEQ_PATH=/path/to/fairseq
FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/commonsense_qa
CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=legacy_ddp \
$DATA_DIR \
--user-dir $FAIRSEQ_USER_DIR \
--restore-file $ROBERTA_PATH \
--reset-optimizer --reset-dataloader --reset-meters \
--no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--task commonsense_qa --init-token 0 --bpe gpt2 \
--arch roberta_large --max-positions 512 \
--dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
--criterion sentence_ranking --num-classes 5 \
--optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 --clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $LR \
--warmup-updates $WARMUP_UPDATES --total-num-update $MAX_UPDATES \
--batch-size $MAX_SENTENCES \
--max-update $MAX_UPDATES \
--log-format simple --log-interval 25 \
--seed $SEED
```
The above command assumes training on 1 GPU with 32GB of RAM. For GPUs with
less memory, decrease `--batch-size` and increase `--update-freq`
accordingly to compensate.
### 3) Evaluate
```python
import json
import torch
from fairseq.models.roberta import RobertaModel
from examples.roberta import commonsense_qa # load the Commonsense QA task
roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'data/CommonsenseQA')
roberta.eval() # disable dropout
roberta.cuda() # use the GPU (optional)
nsamples, ncorrect = 0, 0
with open('data/CommonsenseQA/valid.jsonl') as h:
for line in h:
example = json.loads(line)
scores = []
for choice in example['question']['choices']:
input = roberta.encode(
'Q: ' + example['question']['stem'],
'A: ' + choice['text'],
no_separator=True
)
score = roberta.predict('sentence_classification_head', input, return_logits=True)
scores.append(score)
pred = torch.cat(scores).argmax()
answer = ord(example['answerKey']) - ord('A')
nsamples += 1
if pred == answer:
ncorrect += 1
print('Accuracy: ' + str(ncorrect / float(nsamples)))
# Accuracy: 0.7846027846027847
```
The above snippet is not batched, which makes it quite slow. See [instructions
for batched prediction with RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta#batched-prediction).
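As a rough sketch of what a batched variant could look like, the snippet below groups the five answer choices of each question into a single forward pass; the paths and the `sentence_classification_head` name mirror the snippet above and remain assumptions about your setup.
```python
import json

import torch
from fairseq.data.data_utils import collate_tokens
from fairseq.models.roberta import RobertaModel

roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'data/CommonsenseQA')
roberta.eval()  # disable dropout
roberta.cuda()  # use the GPU (optional)

nsamples, ncorrect = 0, 0
with open('data/CommonsenseQA/valid.jsonl') as h:
    for line in h:
        example = json.loads(line)
        # Encode all five (question, choice) pairs and right-pad them into one batch.
        batch = collate_tokens(
            [
                roberta.encode(
                    'Q: ' + example['question']['stem'],
                    'A: ' + choice['text'],
                    no_separator=True,
                )
                for choice in example['question']['choices']
            ],
            pad_idx=1,
        )
        with torch.no_grad():
            # One forward pass scores all five choices at once.
            scores = roberta.predict('sentence_classification_head', batch, return_logits=True)
        pred = scores.squeeze(-1).argmax().item()
        answer = ord(example['answerKey']) - ord('A')
        nsamples += 1
        ncorrect += int(pred == answer)
print('Accuracy: ' + str(ncorrect / float(nsamples)))
```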
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from . import commonsense_qa_task # noqa
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import json
import os
import numpy as np
import torch
from fairseq.data import (
Dictionary,
IdDataset,
ListDataset,
NestedDictionaryDataset,
NumelDataset,
NumSamplesDataset,
RawLabelDataset,
RightPadDataset,
SortDataset,
data_utils,
encoders,
)
from fairseq.tasks import LegacyFairseqTask, register_task
@register_task("commonsense_qa")
class CommonsenseQATask(LegacyFairseqTask):
"""Task to finetune RoBERTa for Commonsense QA."""
@staticmethod
def add_args(parser):
"""Add task-specific arguments to the parser."""
parser.add_argument(
"data", metavar="DIR", help="path to data directory; we load <split>.jsonl"
)
parser.add_argument(
"--init-token",
type=int,
default=None,
help="add token at the beginning of each batch item",
)
parser.add_argument("--num-classes", type=int, default=5)
def __init__(self, args, vocab):
super().__init__(args)
self.vocab = vocab
self.mask = vocab.add_symbol("<mask>")
self.bpe = encoders.build_bpe(args)
@classmethod
def load_dictionary(cls, filename):
"""Load the dictionary from the filename
Args:
filename (str): the filename
"""
dictionary = Dictionary.load(filename)
dictionary.add_symbol("<mask>")
return dictionary
@classmethod
def setup_task(cls, args, **kwargs):
assert (
args.criterion == "sentence_ranking"
), "Must set --criterion=sentence_ranking"
# load data and label dictionaries
vocab = cls.load_dictionary(os.path.join(args.data, "dict.txt"))
print("| dictionary: {} types".format(len(vocab)))
return cls(args, vocab)
def load_dataset(
self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs
):
"""Load a given dataset split.
Args:
split (str): name of the split (e.g., train, valid, test)
"""
def binarize(s, append_bos=False):
if self.bpe is not None:
s = self.bpe.encode(s)
tokens = self.vocab.encode_line(
s,
append_eos=True,
add_if_not_exist=False,
).long()
if append_bos and self.args.init_token is not None:
tokens = torch.cat([tokens.new([self.args.init_token]), tokens])
return tokens
if data_path is None:
data_path = os.path.join(self.args.data, split + ".jsonl")
if not os.path.exists(data_path):
raise FileNotFoundError("Cannot find data: {}".format(data_path))
src_tokens = [[] for i in range(self.args.num_classes)]
src_lengths = [[] for i in range(self.args.num_classes)]
labels = []
with open(data_path) as h:
for line in h:
example = json.loads(line.strip())
if "answerKey" in example:
label = ord(example["answerKey"]) - ord("A")
labels.append(label)
question = example["question"]["stem"]
assert len(example["question"]["choices"]) == self.args.num_classes
# format: `<s> Q: Where would I not want a fox? </s> A: hen house </s>`
question = "Q: " + question
question_toks = binarize(question, append_bos=True)
for i, choice in enumerate(example["question"]["choices"]):
src = "A: " + choice["text"]
src_bin = torch.cat([question_toks, binarize(src)])
src_tokens[i].append(src_bin)
src_lengths[i].append(len(src_bin))
assert all(
len(src_tokens[0]) == len(src_tokens[i])
for i in range(self.args.num_classes)
)
assert len(src_tokens[0]) == len(src_lengths[0])
assert len(labels) == 0 or len(labels) == len(src_tokens[0])
for i in range(self.args.num_classes):
src_lengths[i] = np.array(src_lengths[i])
src_tokens[i] = ListDataset(src_tokens[i], src_lengths[i])
src_lengths[i] = ListDataset(src_lengths[i])
dataset = {
"id": IdDataset(),
"nsentences": NumSamplesDataset(),
"ntokens": NumelDataset(src_tokens[0], reduce=True),
}
for i in range(self.args.num_classes):
dataset.update(
{
"net_input{}".format(i + 1): {
"src_tokens": RightPadDataset(
src_tokens[i],
pad_idx=self.source_dictionary.pad(),
),
"src_lengths": src_lengths[i],
}
}
)
if len(labels) > 0:
dataset.update({"target": RawLabelDataset(labels)})
dataset = NestedDictionaryDataset(
dataset,
sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])],
)
with data_utils.numpy_seed(self.args.seed):
dataset = SortDataset(
dataset,
# shuffle
sort_order=[np.random.permutation(len(dataset))],
)
print("| Loaded {} with {} samples".format(split, len(dataset)))
self.datasets[split] = dataset
return self.datasets[split]
def build_model(self, args):
from fairseq import models
model = models.build_model(args, self)
model.register_classification_head(
"sentence_classification_head",
num_classes=1,
)
return model
@property
def source_dictionary(self):
return self.vocab
@property
def target_dictionary(self):
return self.vocab
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
OUTDIR=data/CommonsenseQA
mkdir -p $OUTDIR
wget -O $OUTDIR/train.jsonl https://s3.amazonaws.com/commensenseqa/train_rand_split.jsonl
wget -O $OUTDIR/valid.jsonl https://s3.amazonaws.com/commensenseqa/dev_rand_split.jsonl
wget -O $OUTDIR/test.jsonl https://s3.amazonaws.com/commensenseqa/test_rand_split_no_answers.jsonl
wget -O $OUTDIR/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import contextlib
import sys
from collections import Counter
from multiprocessing import Pool
from fairseq.data.encoders.gpt2_bpe import get_encoder
def main():
"""
Helper script to encode raw text with the GPT-2 BPE using multiple processes.
The encoder.json and vocab.bpe files can be obtained here:
- https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
- https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"--encoder-json",
help="path to encoder.json",
)
parser.add_argument(
"--vocab-bpe",
type=str,
help="path to vocab.bpe",
)
parser.add_argument(
"--inputs",
nargs="+",
default=["-"],
help="input files to filter/encode",
)
parser.add_argument(
"--outputs",
nargs="+",
default=["-"],
help="path to save encoded outputs",
)
parser.add_argument(
"--keep-empty",
action="store_true",
help="keep empty lines",
)
parser.add_argument("--workers", type=int, default=20)
args = parser.parse_args()
assert len(args.inputs) == len(
args.outputs
), "number of input and output paths should match"
with contextlib.ExitStack() as stack:
inputs = [
stack.enter_context(open(input, "r", encoding="utf-8"))
if input != "-"
else sys.stdin
for input in args.inputs
]
outputs = [
stack.enter_context(open(output, "w", encoding="utf-8"))
if output != "-"
else sys.stdout
for output in args.outputs
]
encoder = MultiprocessingEncoder(args)
pool = Pool(args.workers, initializer=encoder.initializer)
encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 100)
stats = Counter()
for i, (filt, enc_lines) in enumerate(encoded_lines, start=1):
if filt == "PASS":
for enc_line, output_h in zip(enc_lines, outputs):
print(enc_line, file=output_h)
else:
stats["num_filtered_" + filt] += 1
if i % 10000 == 0:
print("processed {} lines".format(i), file=sys.stderr)
for k, v in stats.most_common():
print("[{}] filtered {} lines".format(k, v), file=sys.stderr)
class MultiprocessingEncoder(object):
def __init__(self, args):
self.args = args
def initializer(self):
global bpe
bpe = get_encoder(self.args.encoder_json, self.args.vocab_bpe)
def encode(self, line):
global bpe
ids = bpe.encode(line)
return list(map(str, ids))
def decode(self, tokens):
global bpe
return bpe.decode(tokens)
def encode_lines(self, lines):
"""
Encode a set of lines. All lines will be encoded together.
"""
enc_lines = []
for line in lines:
line = line.strip()
if len(line) == 0 and not self.args.keep_empty:
return ["EMPTY", None]
tokens = self.encode(line)
enc_lines.append(" ".join(tokens))
return ["PASS", enc_lines]
def decode_lines(self, lines):
dec_lines = []
for line in lines:
tokens = map(int, line.strip().split())
dec_lines.append(self.decode(tokens))
return ["PASS", dec_lines]
if __name__ == "__main__":
main()