#!/usr/bin/env python3 -u
import argparse
import fileinput
import logging
import os
import sys
from fairseq.models.transformer import TransformerModel
logging.getLogger().setLevel(logging.INFO)
def main():
parser = argparse.ArgumentParser(description="")
parser.add_argument("--en2fr", required=True, help="path to en2fr model")
parser.add_argument(
"--fr2en", required=True, help="path to fr2en mixture of experts model"
)
parser.add_argument(
"--user-dir", help="path to fairseq examples/translation_moe/src directory"
)
parser.add_argument(
"--num-experts",
type=int,
default=10,
help="(keep at 10 unless using a different model)",
)
parser.add_argument(
"files",
nargs="*",
default=["-"],
help='input files to paraphrase; "-" for stdin',
)
args = parser.parse_args()
if args.user_dir is None:
args.user_dir = os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))), # examples/
"translation_moe",
"src",
)
if os.path.exists(args.user_dir):
logging.info("found user_dir:" + args.user_dir)
else:
raise RuntimeError(
"cannot find fairseq examples/translation_moe/src "
"(tried looking here: {})".format(args.user_dir)
)
logging.info("loading en2fr model from:" + args.en2fr)
en2fr = TransformerModel.from_pretrained(
model_name_or_path=args.en2fr,
tokenizer="moses",
bpe="sentencepiece",
).eval()
logging.info("loading fr2en model from:" + args.fr2en)
fr2en = TransformerModel.from_pretrained(
model_name_or_path=args.fr2en,
tokenizer="moses",
bpe="sentencepiece",
user_dir=args.user_dir,
task="translation_moe",
).eval()
def gen_paraphrases(en):
fr = en2fr.translate(en)
return [
fr2en.translate(fr, inference_step_args={"expert": i})
for i in range(args.num_experts)
]
logging.info("Type the input sentence and press return:")
for line in fileinput.input(args.files):
line = line.strip()
if len(line) == 0:
continue
for paraphrase in gen_paraphrases(line):
print(paraphrase)
if __name__ == "__main__":
main()
# Pay Less Attention with Lightweight and Dynamic Convolutions (Wu et al., 2019)
This page contains pointers to pre-trained models as well as instructions on how to train new models for [our paper](https://arxiv.org/abs/1901.10430).
## Citation:
```bibtex
@inproceedings{wu2018pay,
title = {Pay Less Attention with Lightweight and Dynamic Convolutions},
author = {Felix Wu and Angela Fan and Alexei Baevski and Yann Dauphin and Michael Auli},
booktitle = {International Conference on Learning Representations},
year = {2019},
url = {https://arxiv.org/abs/1901.10430},
}
```
## Translation
### Pre-trained models
For some datasets we release models without GLUs, which are faster at inference.
Model | Description | Dataset | Download
---|---|---|---
`lightconv.no_glu.iwslt14.de-en` | LightConv (without GLUs) | [IWSLT14 German-English](https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.lightconv.tar.gz) <br> IWSLT14 test: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/iwslt14.de-en.test.tar.bz2)
`dynamicconv.no_glu.iwslt14.de-en` | DynamicConv (without GLUs) | [IWSLT14 German-English](https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.dynamicconv.tar.gz) <br> IWSLT14 test: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/iwslt14.de-en.test.tar.bz2)
`lightconv.no_glu.wmt16.en-de` | LightConv (without GLUs) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2)
`dynamicconv.no_glu.wmt16.en-de` | DynamicConv (without GLUs) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2)
`lightconv.glu.wmt16.en-de` | LightConv | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv-glu.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2)
`dynamicconv.glu.wmt16.en-de` | DynamicConv | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv-glu.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2)
`lightconv.glu.wmt14.en-fr` | LightConv | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.lightconv-glu.tar.gz) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2)
`dynamicconv.glu.wmt14.en-fr` | DynamicConv | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.dynamicconv-glu.tar.gz) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2)
`lightconv.glu.wmt17.zh-en` | LightConv | [WMT17 Chinese-English](http://statmt.org/wmt17/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.lightconv-glu.tar.gz) <br> newstest2017: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt17.zh-en.newstest2017.tar.bz2)
`dynamicconv.glu.wmt17.zh-en` | DynamicConv | [WMT17 Chinese-English](http://statmt.org/wmt17/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.dynamicconv-glu.tar.gz) <br> newstest2017: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt17.zh-en.newstest2017.tar.bz2)
### Memory-Efficient CUDA Kernels
Since the PyTorch implementations of Light/Dynamic conv are quite memory intensive, we have developed CUDA kernels that implement the light and dynamic convolution operator in a memory-efficient and performant manner. For large sequence lengths, these kernels save about 50% memory compared to the PyTorch equivalent.
To install the kernels, use the commands below. Once installed, they will automatically be used in place of the PyTorch implementations whenever a light or dynamic convolution is used.
```sh
# to install lightconv
cd fairseq/modules/lightconv_layer
python cuda_function_gen.py
python setup.py install
# to install dynamicconv
cd fairseq/modules/dynamicconv_layer
python cuda_function_gen.py
python setup.py install
```
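If you want to check whether the compiled kernels are actually picked up, a quick import test works. This is a minimal sketch; it assumes the extensions are installed as Python modules named `lightconv_cuda` and `dynamicconv_cuda` (the names used by the kernel `setup.py` scripts), which should be verified against your fairseq version:
```python
# Minimal check that the compiled extensions are importable. If an import
# fails, fairseq falls back to the PyTorch implementation.
import importlib

for name in ("lightconv_cuda", "dynamicconv_cuda"):
    try:
        importlib.import_module(name)
        print(f"{name}: CUDA kernel available")
    except ImportError:
        print(f"{name}: not found, PyTorch implementation will be used")
```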
### Example usage (torch.hub)
We require a few additional Python dependencies for preprocessing:
```bash
pip install sacremoses subword_nmt
```
Interactive translation via PyTorch Hub:
```python
import fairseq
import torch
# List available models
torch.hub.list('pytorch/fairseq') # [..., 'lightconv.glu.wmt17.zh-en', ... ]
# Load the LightConv model trained on WMT'17 Zh-En
zh2en = torch.hub.load('pytorch/fairseq', 'lightconv.glu.wmt17.zh-en', tokenizer='moses', bpe='subword_nmt')
# The underlying model is available under the *models* attribute
assert isinstance(zh2en.models[0], fairseq.models.lightconv.LightConvModel)
# Translate a sentence
zh2en.translate('你好 世界')
# 'Hello World'
```
Loading custom models:
```python
from fairseq.models.lightconv import LightConvModel
en2fr = LightConvModel.from_pretrained(
'/path/to/checkpoints',
checkpoint_file='checkpoint_best.pt',
data_name_or_path='data-bin/wmt14_en_fr',
bpe='subword_nmt',
bpe_codes='data-bin/wmt14_en_fr/en.code'
)
en2fr.translate('Hello world!')
# 'Bonjour le monde'
```
### Preprocessing the training datasets
Please follow the instructions in [`examples/translation/README.md`](../translation/README.md) to preprocess the data.
### Training and evaluation options
To use the model without GLU, please set `--encoder-glu 0 --decoder-glu 0`.
For LightConv, please use `--encoder-conv-type lightweight --decoder-conv-type lightweight`, otherwise the default is DynamicConv.
For best BLEU results, the length penalty (`--lenpen`) may need to be manually tuned.
To use the CUDA kernels, first install the PyTorch modules using the commands
above. Once the CUDA modules are installed, they will automatically be used
instead of the PyTorch modules.
### IWSLT14 De-En
Training and evaluating DynamicConv (without GLU) on a GPU:
```sh
# Training
SAVE="save/dynamic_conv_iwslt"
mkdir -p $SAVE
CUDA_VISIBLE_DEVICES=0 $(which fairseq-train) data-bin/iwslt14.tokenized.de-en \
--clip-norm 0 --optimizer adam --lr 0.0005 \
--source-lang de --target-lang en --max-tokens 4000 --no-progress-bar \
--log-interval 100 --stop-min-lr '1e-09' --weight-decay 0.0001 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--lr-scheduler inverse_sqrt \
--ddp-backend=legacy_ddp \
--max-update 50000 --warmup-updates 4000 --warmup-init-lr '1e-07' \
--adam-betas '(0.9, 0.98)' --keep-last-epochs 10 \
-a lightconv_iwslt_de_en --save-dir $SAVE \
--dropout 0.3 --attention-dropout 0.1 --weight-dropout 0.1 \
--encoder-glu 0 --decoder-glu 0
python scripts/average_checkpoints.py --inputs $SAVE \
--num-epoch-checkpoints 10 --output "${SAVE}/checkpoint_last10_avg.pt"
# Evaluation
CUDA_VISIBLE_DEVICES=0 fairseq-generate data-bin/iwslt14.tokenized.de-en --path "${SAVE}/checkpoint_last10_avg.pt" --batch-size 128 --beam 4 --remove-bpe --lenpen 1 --gen-subset test --quiet
```
### WMT16 En-De
Training and evaluating DynamicConv (with GLU) on WMT16 En-De using cosine scheduler on one machine with 8 V100 GPUs:
```sh
# Training
SAVE="save/dynamic_conv_wmt16en2de"
mkdir -p $SAVE
python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \
data-bin/wmt16_en_de_bpe32k --fp16 --log-interval 100 --no-progress-bar \
--max-update 30000 --share-all-embeddings --optimizer adam \
--adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
--ddp-backend=legacy_ddp --max-tokens 3584 \
--lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \
--lr-shrink 1 --lr 0.001 --min-lr 1e-7 \
--t-mult 1 --lr-period-updates 20000 \
--arch lightconv_wmt_en_de_big --save-dir $SAVE \
--dropout 0.3 --attention-dropout 0.1 --weight-dropout 0.1 \
--encoder-glu 1 --decoder-glu 1
# Evaluation
CUDA_VISIBLE_DEVICES=0 fairseq-generate data-bin/wmt16.en-de.joined-dict.newstest2014 --path "${SAVE}/checkpoint_best.pt" --batch-size 128 --beam 5 --remove-bpe --lenpen 0.5 --gen-subset test > wmt16_gen.txt
bash scripts/compound_split_bleu.sh wmt16_gen.txt
```
### WMT14 En-Fr
Training DynamicConv (with GLU) on WMT14 En-Fr using cosine scheduler on one machine with 8 V100 GPUs:
```sh
# Training
SAVE="save/dynamic_conv_wmt14en2fr"
mkdir -p $SAVE
python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \
data-bin/wmt14_en_fr --fp16 --log-interval 100 --no-progress-bar \
--max-update 30000 --share-all-embeddings --optimizer adam \
--adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
--ddp-backend=legacy_ddp --max-tokens 3584 \
--lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \
--lr-shrink 1 --lr 0.001 --min-lr 1e-7 \
--t-mult 1 --lr-period-updates 70000 \
--arch lightconv_wmt_en_fr_big --save-dir $SAVE \
--dropout 0.1 --attention-dropout 0.1 --weight-dropout 0.1 \
--encoder-glu 1 --decoder-glu 1
# Evaluation
CUDA_VISIBLE_DEVICES=0 fairseq-generate data-bin/wmt14.en-fr.joined-dict.newstest2014 --path "${SAVE}/checkpoint_best.pt" --batch-size 128 --beam 5 --remove-bpe --lenpen 0.9 --gen-subset test
```
# Transformer with Pointer-Generator Network
This page describes the `transformer_pointer_generator` model that incorporates
a pointing mechanism in the Transformer model that facilitates copying of input
words to the output. This architecture is described in [Enarvi et al. (2020)](https://www.aclweb.org/anthology/2020.nlpmc-1.4/).
## Background
The pointer-generator network was introduced in [See et al. (2017)](https://arxiv.org/abs/1704.04368)
for RNN encoder-decoder attention models. A similar mechanism can be
incorporated in a Transformer model by reusing one of the many attention
distributions for pointing. The attention distribution over the input words is
interpolated with the normal output distribution over the vocabulary words. This
allows the model to generate words that appear in the input, even if they don't
appear in the vocabulary, helping especially with small vocabularies.
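For intuition, the interpolation can be written as a handful of tensor operations. The sketch below is illustrative only and is not taken verbatim from the fairseq implementation (names and shapes are chosen for the example): a generation probability `p_gen` scales the vocabulary distribution, while `1 - p_gen` scales the attention weights, which are scattered onto the vocabulary entries of the corresponding source tokens.
```python
import torch

def pointer_generator_mixture(vocab_probs, attn, src_tokens, p_gen):
    """Illustrative mixture of a vocabulary distribution and a copy distribution.

    vocab_probs: (batch, tgt_len, vocab_size) softmax over the vocabulary
    attn:        (batch, tgt_len, src_len)    attention over source positions
    src_tokens:  (batch, src_len)             source token ids
    p_gen:       (batch, tgt_len, 1)          generation probability in [0, 1]
    """
    gen_dist = vocab_probs * p_gen
    copy_dist = torch.zeros_like(gen_dist)
    index = src_tokens.unsqueeze(1).expand(-1, attn.size(1), -1)
    # Add each attention weight to the vocabulary entry of its source token.
    copy_dist.scatter_add_(2, index, attn * (1.0 - p_gen))
    return gen_dist + copy_dist
```
The actual `output_layer` in `transformer_pg.py` (included later on this page) additionally pads the vocabulary distribution with zeros for the `<unk-N>` entries before mixing.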
## Implementation
The mechanism for copying out-of-vocabulary words from the input has been
implemented differently from See et al. In their [implementation](https://github.com/abisee/pointer-generator)
they convey the word identities through the model in order to be able to produce
words that appear in the input sequence but not in the vocabulary. A different
approach was taken in the Fairseq implementation to keep it self-contained in
the model file, avoiding any changes to the rest of the code base. Instead,
copying out-of-vocabulary words is made possible by pre-processing the input and
post-processing the output. This is described in detail in the next section.
## Usage
The training and evaluation procedure is outlined below. You can also find a
more detailed example for the XSum dataset on [this page](README.xsum.md).
##### 1. Create a vocabulary and extend it with source position markers
The pointing mechanism is especially helpful with small vocabularies, if we are
able to recover the identities of any out-of-vocabulary words that are copied
from the input. For this purpose, the model allows extending the vocabulary with
special tokens that can be used in place of `<unk>` tokens to identify different
input positions. For example, the user may add `<unk-0>`, `<unk-1>`, `<unk-2>`,
etc. to the end of the vocabulary, after the normal words. Below is an example
of how to create a vocabulary of the 10000 most common words and add 1000 input
position markers.
```bash
vocab_size=10000
position_markers=1000
export LC_ALL=C
cat train.src train.tgt |
tr -s '[:space:]' '\n' |
sort |
uniq -c |
sort -k1,1bnr -k2 |
head -n "$((vocab_size - 4))" |
awk '{ print $2 " " $1 }' >dict.pg.txt
python3 -c "[print('<unk-{}> 0'.format(n)) for n in range($position_markers)]" >>dict.pg.txt
```
##### 2. Preprocess the text data
The idea is that any `<unk>` token in the text is replaced with `<unk-0>` if it
appears in the first input position, `<unk-1>` if it appears in the second
input position, and so on. This can be achieved using the `preprocess.py` script
that is provided in this directory.
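For illustration, the replacement amounts to the following. This is a simplified sketch, not the script itself; the real `preprocess.py` also reuses the position of the first occurrence for repeated out-of-vocabulary words and rewrites the target side consistently:
```python
def mark_oovs(source_line, vocabulary):
    """Replace out-of-vocabulary source tokens with <unk-N> position markers."""
    tokens = source_line.split()
    return " ".join(
        tok if tok in vocabulary else "<unk-{}>".format(pos)
        for pos, tok in enumerate(tokens)
    )

print(mark_oovs("de roon moved to teesside", {"de", "moved", "to"}))
# de <unk-1> moved to <unk-4>
```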
##### 3. Train a model
The number of these special tokens is given to the model with the
`--source-position-markers` argument—the model simply maps all of these to the
same word embedding as `<unk>`.
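A rough sketch of that mapping (mirroring the `Embedding.forward` logic in the accompanying `transformer_pg.py`; the concrete numbers follow the XSum example below, where the normal vocabulary has 10000 entries and `<unk>` has index 3):
```python
import torch

num_embeddings = 10000  # size of the normal vocabulary
unk_idx = 3             # dictionary index of <unk>

token_ids = torch.tensor([5, 9999, 10000, 10999])  # the last two are <unk-N> markers
mapped = torch.where(
    token_ids >= num_embeddings, torch.full_like(token_ids, unk_idx), token_ids
)
print(mapped)  # tensor([   5, 9999,    3,    3])
```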
The attention distribution that is used for pointing is selected using the
`--alignment-heads` and `--alignment-layer` command-line arguments in the same
way as with the `transformer_align` model.
##### 4. Generate text and postprocess it
When using the model to generate text, you want to preprocess the input text in
the same way that the training data was processed, replacing out-of-vocabulary words
with `<unk-N>` tokens. If any of these tokens are copied to the output, the
actual words can be retrieved from the unprocessed input text. Any `<unk-N>`
token should be replaced with the word at position N in the original input
sequence. This can be achieved using the `postprocess.py` script.
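The restoration itself is a simple string substitution; a minimal sketch (the provided `postprocess.py` does the same thing with proper error handling):
```python
import re

def restore_oovs(source_line, hypo_line):
    """Replace each <unk-N> in the hypothesis with the source word at position N."""
    source_words = source_line.split()
    return re.sub(
        r"<unk-([0-9]+)>", lambda m: source_words[int(m.group(1))], hypo_line
    )

print(restore_oovs("de roon moved to teesside", "striker de <unk-1> joined <unk-4>"))
# striker de roon joined teesside
```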
## Training a pointer-generator model on the Extreme Summarization dataset
##### 1. Download the Extreme Summarization data and preprocess it
Follow the instructions [here](https://github.com/EdinburghNLP/XSum) to obtain
the original Extreme Summarization dataset. You should have six files,
{train,validation,test}.{document,summary}.
##### 2. Create a vocabulary and extend it with source position markers
```bash
vocab_size=10000
position_markers=1000
export LC_ALL=C
cat train.document train.summary |
tr -s '[:space:]' '\n' |
sort |
uniq -c |
sort -k1,1bnr -k2 |
head -n "$((vocab_size - 4))" |
awk '{ print $2 " " $1 }' >dict.pg.txt
python3 -c "[print('<unk-{}> 0'.format(n)) for n in range($position_markers)]" >>dict.pg.txt
```
This creates the file `dict.pg.txt`, which contains the 10k most frequent words,
followed by 1k source position markers:
```
the 4954867
. 4157552
, 3439668
to 2212159
a 1916857
of 1916820
and 1823350
...
<unk-0> 0
<unk-1> 0
<unk-2> 0
<unk-3> 0
<unk-4> 0
...
```
##### 3. Preprocess the text data
```bash
./preprocess.py --source train.document --target train.summary --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out train.pg.src --target-out train.pg.tgt
./preprocess.py --source validation.document --target validation.summary --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out valid.pg.src --target-out valid.pg.tgt
./preprocess.py --source test.document --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out test.pg.src
```
The data should now contain `<unk-N>` tokens in place of out-of-vocabulary words.
##### 4. Binarize the dataset
```bash
fairseq-preprocess \
--source-lang src \
--target-lang tgt \
--trainpref train.pg \
--validpref valid.pg \
--destdir bin \
--workers 60 \
--srcdict dict.pg.txt \
--joined-dictionary
```
##### 5. Train a model
```bash
total_updates=20000
warmup_updates=500
lr=0.001
max_tokens=4096
update_freq=4
pointer_layer=-2
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 fairseq-train bin \
--user-dir examples/pointer_generator/pointer_generator_src \
--max-tokens "$max_tokens" \
--task translation \
--source-lang src --target-lang tgt \
--truncate-source \
--layernorm-embedding \
--share-all-embeddings \
--encoder-normalize-before \
--decoder-normalize-before \
--required-batch-size-multiple 1 \
--arch transformer_pointer_generator \
--alignment-layer "$pointer_layer" \
--alignment-heads 1 \
--source-position-markers 1000 \
--criterion label_smoothed_cross_entropy \
--label-smoothing 0.1 \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-08 \
--clip-norm 0.1 \
--lr-scheduler inverse_sqrt --lr "$lr" --max-update "$total_updates" --warmup-updates "$warmup_updates" \
--update-freq "$update_freq" \
--skip-invalid-size-inputs-valid-test
```
Above we specify that our dictionary contains 1000 source position markers, and
that we want to use one attention head from the penultimate decoder layer for
pointing. Training should take about 5.5 hours on one node with eight 32GB V100
GPUs. The logged messages confirm that dictionary indices of 10000 and above will
be mapped to
the `<unk>` embedding:
```
2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | [src] dictionary: 11000 types
2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | [tgt] dictionary: 11000 types
2020-09-24 20:43:53 | INFO | fairseq.data.data_utils | loaded 11332 examples from: bin/valid.src-tgt.src
2020-09-24 20:43:53 | INFO | fairseq.data.data_utils | loaded 11332 examples from: bin/valid.src-tgt.tgt
2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | bin valid src-tgt 11332 examples
2020-09-24 20:43:53 | INFO | fairseq.models.transformer_pg | dictionary indices from 10000 to 10999 will be mapped to 3
```
##### 6. Summarize the test sequences
```bash
batch_size=32
beam_size=6
max_length=60
length_penalty=1.0
fairseq-interactive bin \
--user-dir examples/pointer_generator/pointer_generator_src \
--batch-size "$batch_size" \
--task translation \
--source-lang src --target-lang tgt \
--path checkpoints/checkpoint_last.pt \
--input test.pg.src \
--buffer-size 200 \
--max-len-a 0 \
--max-len-b "$max_length" \
--lenpen "$length_penalty" \
--beam "$beam_size" \
--skip-invalid-size-inputs-valid-test |
tee generate.out
grep ^H generate.out | cut -f 3- >generate.hyp
```
Now you should have the generated sequences in `generate.hyp`. They contain
`<unk-N>` tokens that the model has copied from the source sequence. In order to
retrieve the original words, we need the unprocessed source sequences from
`test.document`.
##### 7. Process the generated output
Since we skipped overly long inputs when producing `generate.hyp`, we also have
to skip them now when reading `test.document`.
```bash
./postprocess.py \
--source <(awk 'NF<1024' test.document) \
--target generate.hyp \
--target-out generate.hyp.processed
```
You will now find the final sequences in `generate.hyp.processed`, with each
`<unk-N>` token replaced by the original word from the source sequence.
##### An example of a summarized sequence
The original source document in `test.document`:
> de roon moved to teesside in june 2016 for an initial # 8.8 m fee and played 33 premier league games last term . the netherlands international , 26 , scored five goals in 36 league and cup games during his spell at boro . meanwhile , manager garry monk confirmed the championship club 's interest in signing chelsea midfielder lewis baker . `` he 's a target and one of many that we 've had throughout the summer months , '' said monk . find all the latest football transfers on our dedicated page .
The preprocessed source document in `test.pg.src`:
> de \<unk-1> moved to \<unk-4> in june 2016 for an initial # \<unk-12> m fee and played 33 premier league games last term . the netherlands international , 26 , scored five goals in 36 league and cup games during his spell at boro . meanwhile , manager garry monk confirmed the championship club 's interest in signing chelsea midfielder lewis baker . `` he 's a target and one of many that we 've had throughout the summer months , '' said monk . find all the latest football transfers on our dedicated page .
The generated summary in `generate.hyp`:
> middlesbrough striker \<unk> de \<unk-1> has joined spanish side \<unk> on a season-long loan .
The generated summary after postprocessing in `generate.hyp.processed`:
> middlesbrough striker \<unk> de roon has joined spanish side \<unk> on a season-long loan .
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from . import transformer_pg # noqa
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
from typing import Any, Dict, Optional, List, Tuple
import torch
import torch.nn as nn
from fairseq import utils
from fairseq.models import register_model, register_model_architecture
from fairseq.models.transformer import (
DEFAULT_MAX_SOURCE_POSITIONS,
DEFAULT_MAX_TARGET_POSITIONS,
TransformerDecoder,
TransformerEncoder,
TransformerModel,
base_architecture,
)
from torch import Tensor
logger = logging.getLogger(__name__)
@register_model("transformer_pointer_generator")
class TransformerPointerGeneratorModel(TransformerModel):
"""
Transformer model from `"Attention Is All You Need" (Vaswani et al, 2017)
<https://arxiv.org/abs/1706.03762>`_, augmented with a pointer-generator
network from `"Get To The Point: Summarization with Pointer-Generator
Networks" (See et al, 2017) <https://arxiv.org/abs/1704.04368>`_.
Args:
encoder (TransformerPointerGeneratorEncoder): the encoder
decoder (TransformerPointerGeneratorDecoder): the decoder
The Transformer pointer-generator model provides the following named
architectures and command-line arguments:
.. argparse::
:ref: fairseq.models.transformer_pointer_generator_parser
:prog:
"""
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
# fmt: off
TransformerModel.add_args(parser)
parser.add_argument('--alignment-heads', type=int, metavar='N',
help='number of attention heads to be used for '
'pointing')
parser.add_argument('--alignment-layer', type=int, metavar='I',
help='layer number to be used for pointing (0 '
'corresponding to the bottommost layer)')
parser.add_argument('--source-position-markers', type=int, metavar='N',
help='dictionary includes N additional items that '
'represent an OOV token at a particular input '
'position')
parser.add_argument('--force-generation', type=float, metavar='P',
default=None,
help='set the vocabulary distribution weight to P, '
'instead of predicting it from the input (1.0 '
'corresponding to generation, 0.0 to pointing)')
# fmt: on
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_architecture(args)
if args.encoder_layers_to_keep:
args.encoder_layers = len(args.encoder_layers_to_keep.split(","))
if args.decoder_layers_to_keep:
args.decoder_layers = len(args.decoder_layers_to_keep.split(","))
if getattr(args, "max_source_positions", None) is None:
args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
if getattr(args, "max_target_positions", None) is None:
args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS
if getattr(args, "source_position_markers", None) is None:
args.source_position_markers = args.max_source_positions
src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
if src_dict != tgt_dict:
raise ValueError("Pointer-generator requires a joined dictionary")
def build_embedding(dictionary, embed_dim, path=None):
# The dictionary may include additional items that can be used in
# place of the normal OOV token and that all map to the same
# embedding. Using a different token for each input position allows
# one to restore the word identities from the original source text.
num_embeddings = len(dictionary) - args.source_position_markers
padding_idx = dictionary.pad()
unk_idx = dictionary.unk()
logger.info(
"dictionary indices from {0} to {1} will be mapped to {2}".format(
num_embeddings, len(dictionary) - 1, unk_idx
)
)
emb = Embedding(num_embeddings, embed_dim, padding_idx, unk_idx)
# if provided, load from preloaded dictionaries
if path:
embed_dict = utils.parse_embedding(path)
utils.load_embedding(embed_dict, dictionary, emb)
return emb
if args.share_all_embeddings:
if args.encoder_embed_dim != args.decoder_embed_dim:
raise ValueError(
"--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
)
if args.decoder_embed_path and (
args.decoder_embed_path != args.encoder_embed_path
):
raise ValueError(
"--share-all-embeddings not compatible with --decoder-embed-path"
)
encoder_embed_tokens = build_embedding(
src_dict, args.encoder_embed_dim, args.encoder_embed_path
)
decoder_embed_tokens = encoder_embed_tokens
args.share_decoder_input_output_embed = True
else:
encoder_embed_tokens = build_embedding(
src_dict, args.encoder_embed_dim, args.encoder_embed_path
)
decoder_embed_tokens = build_embedding(
tgt_dict, args.decoder_embed_dim, args.decoder_embed_path
)
encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens)
decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
return cls(args, encoder, decoder)
@classmethod
def build_encoder(cls, args, src_dict, embed_tokens):
return TransformerPointerGeneratorEncoder(args, src_dict, embed_tokens)
@classmethod
def build_decoder(cls, args, tgt_dict, embed_tokens):
return TransformerPointerGeneratorDecoder(args, tgt_dict, embed_tokens)
class TransformerPointerGeneratorEncoder(TransformerEncoder):
"""
Transformer encoder consisting of *args.encoder_layers* layers. Each layer
is a :class:`TransformerEncoderLayer`. The pointer-generator variant adds
the source tokens to the encoder output as these are otherwise not passed
to the decoder.
"""
def forward(
self,
src_tokens,
src_lengths: Optional[Tensor] = None,
return_all_hiddens: bool = False,
token_embeddings: Optional[Tensor] = None
):
"""
Runs the `forward()` method of the parent Transformer class. Then adds
the source tokens into the encoder output tuple.
While it might be more elegant that the model would pass the source
tokens to the `forward()` method of the decoder too, this would require
changes to `SequenceGenerator`.
Args:
src_tokens (torch.LongTensor): tokens in the source language of
shape `(batch, src_len)`
src_lengths (torch.LongTensor): lengths of each source sentence of
shape `(batch)`
return_all_hiddens (bool, optional): also return all of the
intermediate hidden states (default: False).
token_embeddings (torch.Tensor, optional): precomputed embeddings
default `None` will recompute embeddings
Returns:
namedtuple:
- **encoder_out** (Tensor): the last encoder layer's output of
shape `(src_len, batch, embed_dim)`
- **encoder_padding_mask** (ByteTensor): the positions of
padding elements of shape `(batch, src_len)`
- **encoder_embedding** (Tensor): the (scaled) embedding lookup
of shape `(batch, src_len, embed_dim)`
- **encoder_states** (List[Tensor]): all intermediate
hidden states of shape `(src_len, batch, embed_dim)`.
Only populated if *return_all_hiddens* is True.
- **src_tokens** (Tensor): input token ids of shape
`(batch, src_len)`
"""
encoder_out = self.forward_scriptable(src_tokens,
src_lengths,
return_all_hiddens,
token_embeddings)
        # The PyTorch Mobile lite interpreter does not support returning NamedTuple in
# `forward` so we use a dictionary instead.
# TorchScript does not support mixed values so the values are all lists.
# The empty list is equivalent to None.
return {
"encoder_out": encoder_out["encoder_out"], # T x B x C
"encoder_padding_mask": encoder_out["encoder_padding_mask"], # B x T
"encoder_embedding": encoder_out["encoder_embedding"], # B x T x C
"encoder_states": encoder_out["encoder_states"], # List[T x B x C]
"src_tokens": [src_tokens], # B x T
"src_lengths": [],
}
class TransformerPointerGeneratorDecoder(TransformerDecoder):
"""
Transformer decoder consisting of *args.decoder_layers* layers. Each layer
is a :class:`TransformerDecoderLayer`. The pointer-generator variant mixes
the output probabilities with an attention distribution in the output layer.
Args:
args (argparse.Namespace): parsed command-line arguments
dictionary (~fairseq.data.Dictionary): decoding dictionary
embed_tokens (torch.nn.Embedding): output embedding
"""
def __init__(self, args, dictionary, embed_tokens):
super().__init__(args, dictionary, embed_tokens, no_encoder_attn=False)
# In the pointer-generator model these arguments define the decoder
# layer and the number of attention heads that will be averaged to
# create the alignment for pointing.
self.alignment_heads = args.alignment_heads
self.alignment_layer = args.alignment_layer
input_embed_dim = embed_tokens.embedding_dim
# Generation probabilities / interpolation coefficients are predicted
# from the current decoder input embedding and the decoder output, which
# is the size of output_embed_dim.
p_gen_input_size = input_embed_dim + self.output_embed_dim
self.project_p_gens = nn.Linear(p_gen_input_size, 1)
nn.init.zeros_(self.project_p_gens.bias)
# The dictionary may include a separate entry for an OOV token in each
# input position, so that their identity can be restored from the
# original source text.
self.num_types = len(dictionary)
self.num_oov_types = args.source_position_markers
self.num_embeddings = self.num_types - self.num_oov_types
self.force_p_gen = args.force_generation
def forward(
self,
prev_output_tokens,
encoder_out: Optional[Dict[str, List[Tensor]]] = None,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
features_only: bool = False,
alignment_layer: Optional[int] = 0,
alignment_heads: Optional[int] = 1,
src_lengths: Optional[Any] = None,
return_all_hiddens: bool = False,
):
"""
Args:
prev_output_tokens (LongTensor): previous decoder outputs of shape
`(batch, tgt_len)`, for teacher forcing
encoder_out (optional): output from the encoder, used for
encoder-side attention
incremental_state (dict, optional): dictionary used for storing
state during :ref:`Incremental decoding`
features_only (bool, optional): only return features without
applying output layer (default: False)
alignment_layer (int, optional): 0-based index of the layer to be
used for pointing (default: 0)
alignment_heads (int, optional): number of attention heads to be
used for pointing (default: 1)
Returns:
tuple:
- the decoder's output of shape `(batch, tgt_len, vocab)`
- a dictionary with any model-specific outputs
"""
# The normal Transformer model doesn't pass the alignment_layer and
# alignment_heads parameters correctly. We use our local variables.
x, extra = self.extract_features(
prev_output_tokens,
encoder_out=encoder_out,
incremental_state=incremental_state,
alignment_layer=self.alignment_layer,
alignment_heads=self.alignment_heads,
)
if not features_only:
# Embedding the tokens again for generation probability prediction,
# so that we don't have to reimplement the whole extract_features()
# method.
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
prev_output_embed = self.embed_tokens(prev_output_tokens)
prev_output_embed *= self.embed_scale
predictors = torch.cat((prev_output_embed, x), 2)
p_gens = self.project_p_gens(predictors)
p_gens = torch.sigmoid(p_gens.float())
# Torchscript complains if encoder_out or attn are None because
# `output_layer()` signature expects tensors instead
attn: Optional[Tensor] = extra["attn"][0]
assert encoder_out is not None
assert attn is not None
x = self.output_layer(x, attn, encoder_out["src_tokens"][0], p_gens)
return x, extra
def output_layer(
self,
features: Tensor,
attn: Tensor,
src_tokens: Tensor,
p_gens: Tensor
) -> Tensor:
"""
Project features to the vocabulary size and mix with the attention
distributions.
"""
if self.force_p_gen is not None:
p_gens = self.force_p_gen
# project back to size of vocabulary
if self.adaptive_softmax is None:
logits = self.output_projection(features)
else:
logits = features
batch_size = logits.shape[0]
output_length = logits.shape[1]
assert logits.shape[2] == self.num_embeddings
assert src_tokens.shape[0] == batch_size
src_length = src_tokens.shape[1]
# The final output distribution will be a mixture of the normal output
# distribution (softmax of logits) and attention weights.
gen_dists = self.get_normalized_probs_scriptable(
(logits, None), log_probs=False, sample=None
)
gen_dists = torch.mul(gen_dists, p_gens)
padding_size = (batch_size, output_length, self.num_oov_types)
padding = gen_dists.new_zeros(padding_size)
gen_dists = torch.cat((gen_dists, padding), 2)
assert gen_dists.shape[2] == self.num_types
# Scatter attention distributions to distributions over the extended
# vocabulary in a tensor of shape [batch_size, output_length,
# vocab_size]. Each attention weight will be written into a location
# that is for other dimensions the same as in the index tensor, but for
# the third dimension it's the value of the index tensor (the token ID).
attn = torch.mul(attn.float(), 1 - p_gens)
index = src_tokens[:, None, :]
index = index.expand(batch_size, output_length, src_length)
attn_dists_size = (batch_size, output_length, self.num_types)
attn_dists = attn.new_zeros(attn_dists_size)
attn_dists.scatter_add_(2, index, attn.float())
# Final distributions, [batch_size, output_length, num_types].
return gen_dists + attn_dists
def get_normalized_probs(
self,
net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
log_probs: bool,
sample: Optional[Dict[str, Tensor]] = None,
):
"""
Get normalized probabilities (or log probs) from a net's output.
Pointer-generator network output is already normalized.
"""
probs = net_output[0]
# Make sure the probabilities are greater than zero when returning log
# probabilities.
return probs.clamp(1e-10, 1.0).log() if log_probs else probs
class Embedding(nn.Embedding):
r"""A simple lookup table that stores embeddings of a fixed dictionary and size.
This module is often used to store word embeddings and retrieve them using indices.
The input to the module is a list of indices, and the output is the corresponding
word embeddings. This subclass differs from the standard PyTorch Embedding class by
allowing additional vocabulary entries that will be mapped to the unknown token
embedding.
Args:
num_embeddings (int): size of the dictionary of embeddings
embedding_dim (int): the size of each embedding vector
padding_idx (int): Pads the output with the embedding vector at :attr:`padding_idx`
(initialized to zeros) whenever it encounters the index.
unk_idx (int): Maps all token indices that are greater than or equal to
num_embeddings to this index.
Attributes:
weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim)
initialized from :math:`\mathcal{N}(0, 1)`
Shape:
- Input: :math:`(*)`, LongTensor of arbitrary shape containing the indices to extract
- Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}`
.. note::
Keep in mind that only a limited number of optimizers support
sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`),
:class:`optim.SparseAdam` (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`)
.. note::
With :attr:`padding_idx` set, the embedding vector at
:attr:`padding_idx` is initialized to all zeros. However, note that this
vector can be modified afterwards, e.g., using a customized
initialization method, and thus changing the vector used to pad the
output. The gradient for this vector from :class:`~torch.nn.Embedding`
is always zero.
"""
__constants__ = ["unk_idx"]
# Torchscript: Inheriting from Embedding class produces an error when exporting to Torchscript
# -> RuntimeError: Unable to cast Python instance to C++ type (compile in debug mode for details
# It's happening because max_norm attribute from nn.Embedding is None by default and it cannot be
# cast to a C++ type
def __init__(
self,
num_embeddings: int,
embedding_dim: int,
padding_idx: Optional[int],
unk_idx: int,
max_norm: Optional[float] = float("inf"),
):
super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx, max_norm=max_norm)
self.unk_idx = unk_idx
nn.init.normal_(self.weight, mean=0, std=embedding_dim ** -0.5)
nn.init.constant_(self.weight[padding_idx], 0)
def forward(self, input):
input = torch.where(
input >= self.num_embeddings, torch.ones_like(input) * self.unk_idx, input
)
return nn.functional.embedding(
input, self.weight, self.padding_idx, self.max_norm,
self.norm_type, self.scale_grad_by_freq, self.sparse
)
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator"
)
def transformer_pointer_generator(args):
args.alignment_heads = getattr(args, "alignment_heads", 1)
args.alignment_layer = getattr(args, "alignment_layer", -1)
base_architecture(args)
if args.alignment_layer < 0:
args.alignment_layer = args.decoder_layers + args.alignment_layer
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator_iwslt_de_en"
)
def transformer_pointer_generator_iwslt_de_en(args):
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024)
args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
args.encoder_layers = getattr(args, "encoder_layers", 6)
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 1024)
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)
args.decoder_layers = getattr(args, "decoder_layers", 6)
transformer_pointer_generator(args)
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de"
)
def transformer_pointer_generator_wmt_en_de(args):
transformer_pointer_generator(args)
# Transformer pointer-generator with the base Transformer parameters as used in
# the "Attention Is All You Need" paper (Vaswani et al., 2017)
@register_model_architecture(
"transformer_pointer_generator",
"transformer_pointer_generator_vaswani_wmt_en_de_big",
)
def transformer_pointer_generator_vaswani_wmt_en_de_big(args):
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024)
args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
args.dropout = getattr(args, "dropout", 0.3)
transformer_pointer_generator(args)
@register_model_architecture(
"transformer_pointer_generator",
"transformer_pointer_generator_vaswani_wmt_en_fr_big",
)
def transformer_pointer_generator_vaswani_wmt_en_fr_big(args):
args.dropout = getattr(args, "dropout", 0.1)
transformer_pointer_generator_vaswani_wmt_en_de_big(args)
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de_big"
)
def transformer_pointer_generator_wmt_en_de_big(args):
args.attention_dropout = getattr(args, "attention_dropout", 0.1)
transformer_pointer_generator_vaswani_wmt_en_de_big(args)
# default parameters used in tensor2tensor implementation
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de_big_t2t"
)
def transformer_pointer_generator_wmt_en_de_big_t2t(args):
args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)
args.attention_dropout = getattr(args, "attention_dropout", 0.1)
args.activation_dropout = getattr(args, "activation_dropout", 0.1)
transformer_pointer_generator_vaswani_wmt_en_de_big(args)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import re
import sys
class OOVIndexError(IndexError):
def __init__(self, pos, source_seq, target_seq):
super(OOVIndexError, self).__init__(
"A <unk-N> tag in the target sequence refers to a position that is "
"outside the source sequence. Most likely there was a mismatch in "
"provided source and target sequences. Otherwise this would mean that "
"the pointing mechanism somehow attended to a position that is past "
"the actual sequence end."
)
self.source_pos = pos
self.source_seq = source_seq
self.target_seq = target_seq
def replace_oovs(source_in, target_in, target_out):
"""Replaces <unk-N> tokens in the target text with the corresponding word in
the source text.
"""
oov_re = re.compile("^<unk-([0-9]+)>$")
for source_seq, target_seq in zip(source_in, target_in):
target_seq_out = []
pos_to_word = source_seq.strip().split()
for token in target_seq.strip().split():
m = oov_re.match(token)
if m:
pos = int(m.group(1))
if pos >= len(pos_to_word):
raise OOVIndexError(pos, source_seq, target_seq)
token_out = pos_to_word[pos]
else:
token_out = token
target_seq_out.append(token_out)
target_out.write(" ".join(target_seq_out) + "\n")
def main():
parser = argparse.ArgumentParser(
description="Replaces <unk-N> tokens in target sequences with words from "
"the corresponding position in the source sequence."
)
parser.add_argument(
"--source", type=str, help="text file with source sequences", required=True
)
parser.add_argument(
"--target", type=str, help="text file with target sequences", required=True
)
parser.add_argument(
"--target-out",
type=str,
help="where to write target sequences without <unk-N> " "entries",
required=True,
)
args = parser.parse_args()
with open(args.source, "r", encoding="utf-8") as source_in, open(
args.target, "r", encoding="utf-8"
) as target_in, open(args.target_out, "w", encoding="utf-8") as target_out:
replace_oovs(source_in, target_in, target_out)
if __name__ == "__main__":
try:
main()
except OOVIndexError as e:
print(e, file=sys.stderr)
print("Source sequence:", e.source_seq.strip(), file=sys.stderr)
print("Target sequence:", e.target_seq.strip(), file=sys.stderr)
print(
"Source sequence length:",
len(e.source_seq.strip().split()),
file=sys.stderr,
)
print("The offending tag points to:", e.source_pos)
sys.exit(2)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
from itertools import zip_longest
def replace_oovs(source_in, target_in, vocabulary, source_out, target_out):
"""Replaces out-of-vocabulary words in source and target text with <unk-N>,
    where N is the position of the word in the source sequence.
"""
def format_unk(pos):
return "<unk-{}>".format(pos)
if target_in is None:
target_in = []
for seq_num, (source_seq, target_seq) in enumerate(
zip_longest(source_in, target_in)
):
source_seq_out = []
target_seq_out = []
word_to_pos = dict()
for position, token in enumerate(source_seq.strip().split()):
if token in vocabulary:
token_out = token
else:
if token in word_to_pos:
oov_pos = word_to_pos[token]
else:
word_to_pos[token] = position
oov_pos = position
token_out = format_unk(oov_pos)
source_seq_out.append(token_out)
source_out.write(" ".join(source_seq_out) + "\n")
if target_seq is not None:
for token in target_seq.strip().split():
if token in word_to_pos:
token_out = format_unk(word_to_pos[token])
else:
token_out = token
target_seq_out.append(token_out)
if target_out is not None:
target_out.write(" ".join(target_seq_out) + "\n")
def main():
parser = argparse.ArgumentParser(
description="Replaces out-of-vocabulary words in both source and target "
"sequences with tokens that indicate the position of the word "
"in the source sequence."
)
parser.add_argument(
"--source", type=str, help="text file with source sequences", required=True
)
parser.add_argument(
"--target", type=str, help="text file with target sequences", default=None
)
parser.add_argument("--vocab", type=str, help="vocabulary file", required=True)
parser.add_argument(
"--source-out",
type=str,
help="where to write source sequences with <unk-N> entries",
required=True,
)
parser.add_argument(
"--target-out",
type=str,
help="where to write target sequences with <unk-N> entries",
default=None,
)
args = parser.parse_args()
with open(args.vocab, encoding="utf-8") as vocab:
vocabulary = vocab.read().splitlines()
target_in = (
open(args.target, "r", encoding="utf-8") if args.target is not None else None
)
target_out = (
open(args.target_out, "w", encoding="utf-8")
if args.target_out is not None
else None
)
with open(args.source, "r", encoding="utf-8") as source_in, open(
args.source_out, "w", encoding="utf-8"
) as source_out:
replace_oovs(source_in, target_in, vocabulary, source_out, target_out)
if target_in is not None:
target_in.close()
if target_out is not None:
target_out.close()
if __name__ == "__main__":
main()
# Training with Quantization Noise for Extreme Model Compression ({Fan\*, Stock\*} *et al.*, 2020)
This page contains information on how to train and quantize models with Quantization Noise, for both scalar quantization like `int8` and Iterative Product Quantization.
Check out our paper [here](https://arxiv.org/abs/2004.07320).
Looking for pretrained models? They will be added shortly.
Looking for code to train vision models? We are working on open sourcing our code as part of ClassyVision. Please check back, but note that both the Scalar and Iterative Product Quantization counterparts of the `nn.Conv2d` module are already included in this release.
**Contents**:
- [Walk through of code](#walk-through-the-code)
- [Reproduce NLP Results](#looking-to-reproduce-the-nlp-results-in-the-paper)
- [Reproduce Vision Results](#looking-to-reproduce-the-vision-results-in-the-paper)
## Citation
```bibtex
@article{fan2020training,
title={Training with Quantization Noise for Extreme Model Compression},
author={Angela Fan* and Pierre Stock* and Benjamin Graham and Edouard Grave and Remi Gribonval and Herve Jegou and Armand Joulin},
year={2020},
eprint={2004.07320},
archivePrefix={arXiv},
primaryClass={cs.ML}
}
```
## Walk through the code
Training a model with Quant-Noise makes it robust to quantization, which improves performance when the model is later quantized for inference. The technique is useful for both scalar and product quantization methods, and across multiple domains. Below we detail how to train and quantize models, and how to integrate our code to quantize your own models.
### Scalar Quantization
Unlike the [Iterative Product Quantization](#iterative-product-quantization) section, which gives state-of-the-art compression, this section showcases the usefulness of our approach for simple scalar quantization baselines such as `int8`, using on-GPU Fake Quantization.
#### Training
Scalar quantization with Quant-Noise consists of randomly quantizing a proportion `p` of the weights during training. Scalar quantization is implemented [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quantization/scalar) in the form of Fake Quantization, meaning that we emulate int8 on GPU by quantizing and de-quantizing both the weights and the activations. We rely on PyTorch's [quantization primitives](https://github.com/pytorch/pytorch/tree/master/torch/quantization).
To train a model with Quant-Noise, add the following flag:
```
--quant-noise-scalar 0.5
```
Large values of noise make the network easier to quantize but may result in higher non-quantized test and validation perplexities.
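To make the idea concrete, the sketch below shows roughly what fake `int8` quantization with Quant-Noise does to a weight tensor during a training step. It is illustrative only; the actual implementation lives in the linked `quantization/scalar` module and also quantizes the activations:
```python
import torch

def fake_int8_quant_noise(weight, p=0.5):
    """Quantize-dequantize a random proportion p of the weights to emulate int8."""
    scale = weight.abs().max() / 127.0
    dequantized = torch.round(weight / scale).clamp(-128, 127) * scale
    noise_mask = torch.rand_like(weight) < p  # which weights see quantization noise
    return torch.where(noise_mask, dequantized, weight)

weights = torch.randn(4, 8)
print(fake_int8_quant_noise(weights, p=0.5))
```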
#### Quantization
When evaluating a network, all quantized modules and activation hooks automatically switch to `p=1`, so the validation accuracy reported by Fairseq is already the quantized accuracy; there is nothing more to do.
#### Integration with your own code
Looking to quantize your own models with Quant-Noise + Scalar Quantization?
- Use the function `quantize_model_` implemented [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quantization/scalar/utils.py) to (1) replace all your modules by their quantized counterparts and (2) add hooks to those modules to quantize the activations (see the sketch after this list).
- Then, perform your training as usual. Note that in `eval()` mode, the network is always fully quantized (weights and activations) by default (`p=1`).
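A minimal sketch of that integration follows. The import path and the keyword arguments (`p`, `bits`) are assumptions here; check the linked `utils.py` for the exact function signature in your fairseq version:
```python
import torch.nn as nn
from fairseq.modules.quantization.scalar import quantize_model_

model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))

# Replace supported modules with fake-quantized counterparts and register
# activation-quantization hooks in place. Keyword names are assumptions; see
# fairseq/modules/quantization/scalar/utils.py for the exact signature.
quantize_model_(model, p=0.5, bits=8)

# ...then train as usual; in eval() mode the network behaves fully quantized.
```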
### Iterative Product Quantization
Iterative Product Quantization with Quant-Noise proceeds in two steps. First, a model must be trained uncompressed with Quant-Noise. Second, the model must be quantized with iPQ. Note that we implement here the simplest form of noise, which consists of randomly dropping a proportion `p` of blocks; this worked as well as assigning those blocks to their current centroid.
#### Training
To train a model with Quant-Noise, add the following flags:
```
--quant-noise-pq 0.1 --quant-noise-pq-block-size 8
```
`quant-noise-pq` controls how much dropout is applied to the blocks of the weight matrix. `quant-noise-pq-block-size` controls the size of the weight matrix blocks.
We recommend training with 0.05 to 0.2 Quant-Noise, a range that worked well in our experiments. For the block size, we recommend training with a block size of 8. Note that the number of input features must be a multiple of the block size (see the size checks [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quant_noise.py)). Large block sizes result in a higher compression ratio but may induce a loss in accuracy.
We currently support training Transformer based models, such as sequence-to-sequence, language models, and BERT architectures. The `quant_noise` function [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quant_noise.py) wraps a module. It splits a weight matrix into blocks and applies random dropout to these blocks.
In the Transformer architectures, quant-noise is applied to the input and output embeddings, the attention, and the FFN.
Quant-Noise can also be combined with **LayerDrop** (see [here](https://github.com/pytorch/fairseq/tree/master/examples/layerdrop)) to add its pruning effect to the quantized model and make the model even smaller. We recommend training with LayerDrop 0.1 or 0.2.
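For reference, wrapping a single layer with the `quant_noise` helper mentioned above looks like this (`p` is the proportion of blocks dropped and `block_size` the block size; the number of input features, 512 here, must be a multiple of the block size):
```python
import torch.nn as nn
from fairseq.modules.quant_noise import quant_noise

# During training, a proportion p of 8-element weight blocks is dropped
# (quantization noise); at inference the layer behaves like a normal nn.Linear.
layer = quant_noise(nn.Linear(512, 512), p=0.1, block_size=8)
```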
#### Quantization
We implement an improved version of product quantization from Stock et al., **iPQ**, described [here](https://arxiv.org/abs/1907.05686); code using the old API is available [here](https://github.com/facebookresearch/kill-the-bits). Note that we improved the iPQ API in terms of both compute speed and usability, as described below.
For the particular case of PQ, quantization is performed sequentially. We recommend first quantizing the FFNs, then the EMBs, and finally the ATTNs. Quantization is done in two sub-steps:
- First, perform `n` steps of Product Quantization (generally `n=20` is enough).
- Then, finetune the obtained centroids.
#### Integration with your own code
Looking to quantize your own models with Quant-Noise + iPQ?
- First, wrap your modules with the `quant_noise` function [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quant_noise.py), which is module-agnostic, and train your favorite model.
- Then, quantize your trained model using the code [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quantization/pq). This can be done *without any changes to your training loop*. Below is an example code for integration.
Note that we tried our approach only on Transformers and various Convolutional Models such as EfficientNets.
```python
from fairseq.modules.quantization.pq import quantize_model_, SizeTracker
# get configuration parameters
n_centroids_config = config["n_centroids"]
block_sizes_config = config["block_sizes"]
layers_to_quantize = config["layers_to_quantize"]
# size tracker for keeping track of assignments, centroids and non-compressed sizes
size_tracker = SizeTracker(model)
# Quantize model by stages
for step in range(len(layers_to_quantize)):
# quantize model in-place
quantized_layers = quantize_model_(
model,
size_tracker,
layers_to_quantize,
block_sizes_config,
n_centroids_config,
step=step,
)
logger.info(f"Finetuning stage {step}, quantized layers: {quantized_layers}")
logger.info(f"{size_tracker}")
# Don't forget to re-create/update trainer/optimizer since model parameters have changed
optimizer = ...
# Finetune the centroids with your usual training loop for a few epochs
trainer.train_epoch()
```
## Looking to reproduce the NLP results in the paper?
We detail below how to reproduce the state-of-the-art results reported in the paper for Quant-Noise + Iterative Product Quantization.
### Training with Quant-Noise
To **train** RoBERTa + QuantNoise, we followed this setting [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta).
The following command can be used to train a RoBERTa Base + QuantNoise model:
```bash
TOTAL_UPDATES=125000
WARMUP_UPDATES=10000
PEAK_LR=0.0005
TOKENS_PER_SAMPLE=512
MAX_POSITIONS=512
MAX_SENTENCES=16
UPDATE_FREQ=2
DATA_DIR=/path/to/data/here
fairseq-train $DATA_DIR \
--task masked_lm --criterion masked_lm --arch roberta_base \
--sample-break-mode complete \
--tokens-per-sample $TOKENS_PER_SAMPLE --max-positions $MAX_POSITIONS \
--optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 \
--clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $PEAK_LR \
--warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.01 \
--batch-size $MAX_SENTENCES \
--update-freq $UPDATE_FREQ --max-update $TOTAL_UPDATES \
--save-dir checkpoint/roberta \
--ddp-backend legacy_ddp --encoder-layerdrop 0.2 \
--quant-noise-pq 0.2 --quant-noise-pq-block-size 8 --untie-weights-roberta
```
To **finetune** RoBERTa + QuantNoise, we followed the GLUE finetuning setup described [here](https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.glue.md).
The following command can be used to finetune a RoBERTa Base + QuantNoise model on the RTE dataset:
```bash
TOTAL_NUM_UPDATES=2036
WARMUP_UPDATES=122
LR=2e-05
NUM_CLASSES=2
MAX_SENTENCES=16
ROBERTA_PATH=/path/to/roberta_quantnoise/model.pt
fairseq-train /path/to/rte/data/ \
--restore-file $ROBERTA_PATH \
--max-positions 512 \
--batch-size $MAX_SENTENCES \
--max-tokens 4400 \
--task sentence_prediction \
--reset-optimizer --reset-dataloader --reset-meters \
--required-batch-size-multiple 1 \
--init-token 0 --separator-token 2 \
--arch roberta_large \
--criterion sentence_prediction \
--num-classes $NUM_CLASSES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
--clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--max-epoch 10 \
--find-unused-parameters \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--ddp-backend legacy_ddp \
--quant-noise-pq 0.2 --quant-noise-pq-block-size 8
```
To **train** language models on Wikitext-103, we followed the language modeling setup described [here](https://github.com/pytorch/fairseq/tree/master/examples/language_model).
The following command can be used to train a Transformer + QuantNoise model on Wikitext-103:
```bash
fairseq-train --task language_modeling /path/to/wikitext-103/data \
--save-dir checkpoints/transformer_wikitext-103 \
--adaptive-input --adaptive-input-cutoff 20000,60000 --adaptive-input-factor 4 \
--adaptive-softmax-cutoff 20000,60000 --adaptive-softmax-dropout 0.2 --adaptive-softmax-factor 4.0 \
--tie-adaptive-proj --tie-adaptive-weights \
--arch transformer_lm_gbw \
--attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \
--clip-norm 0.1 --criterion adaptive_loss \
--ddp-backend legacy_ddp \
--decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 \
--decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \
--min-lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --lr 1.0 --t-mult 2.0 \
--max-tokens 3072 --tokens-per-sample 3072 --momentum 0.99 --optimizer nag \
--sample-break-mode none --update-freq 3 \
--warmup-init-lr 1e-07 --warmup-updates 16000 \
--weight-decay 0 --seed 1 --stop-min-lr 1e-09 \
--quant-noise-pq 0.05 --quant-noise-pq-block-size 8
```
To **evaluate** this model, use `fairseq-eval-lm` (rather than `fairseq-train`). The following command can be used to evaluate:
```bash
fairseq-eval-lm /path/to/wikitext-103/data --path /path/to/model/checkpoint \
--sample-break-mode complete \
--max-tokens 3072 \
--context-window 2560 \
--softmax-batch 1024 \
--gen-subset valid
```
and change the `--gen-subset` to `test` if you would like to evaluate on the test set instead.
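If you would rather probe the trained language model from Python, a rough sketch along the lines of fairseq's hub interface is given below; the checkpoint directory and data path are placeholders and may need adjusting for your setup.
```python
from fairseq.models.transformer_lm import TransformerLanguageModel

# Load the trained LM together with the binarized Wikitext-103 dictionary;
# both paths below are placeholders.
lm = TransformerLanguageModel.from_pretrained(
    '/path/to/checkpoints',
    checkpoint_file='checkpoint_best.pt',
    data_name_or_path='/path/to/wikitext-103/data',
)
lm.eval()  # disable dropout

# Sample a short continuation (Wikitext-103 here is word-level, so no BPE is needed).
print(lm.sample('The book was', beam=1, sampling=True, sampling_topk=10))
```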
### Iterative Product Quantization
To quantize the finetuned RoBERTa model, we use this command on 1 GPU. This should run in a day.
```bash
TOTAL_NUM_UPDATES=6108 # 2036 updates for each iteration
WARMUP_UPDATES=122
LR=2e-05
NUM_CLASSES=2
MAX_SENTENCES=16
fairseq-train --task sentence_prediction /path/to/data/ \
--restore-file $ROBERTA_PATH \
--save-dir checkpoints/roberta_finetuned \
--max-positions 512 \
--batch-size $MAX_SENTENCES \
--max-tokens 4400 \
--init-token 0 --separator-token 2 \
--arch roberta_large \
--criterion sentence_prediction \
--num-classes $NUM_CLASSES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
--clip-norm 0.0 --lr-scheduler polynomial_decay \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--no-progress-bar --skip-invalid-size-inputs-valid-test --ddp-backend legacy_ddp \
--quantization-config-path /path/to/config/yaml
```
To quantize the trained Language Model, we use this command on 8 V100 23GB GPUs. This should run in a couple of hours.
```bash
fairseq-train --task language_modeling /path/to/wikitext-103/data \
--save-dir checkpoints/transformer_wikitext-103 \
--adaptive-input --adaptive-input-cutoff 20000,60000 --adaptive-input-factor 4 \
--adaptive-softmax-cutoff 20000,60000 --adaptive-softmax-dropout 0.2 --adaptive-softmax-factor 4.0 \
--arch transformer_lm_gbw \
--attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \
--bucket-cap-mb 25 --char-embedder-highway-layers 2 --character-embedding-dim 4 \
--clip-norm 0.1 --criterion adaptive_loss \
--ddp-backend legacy_ddp \
--decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 --decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \
--fp16 --keep-last-epochs -1 \
--min-lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --lr 0.05 --stop-min-lr 1e-09 \
--max-tokens 2944 --tokens-per-sample 2944\
--momentum 0.99 --no-epoch-checkpoints --no-progress-bar --optimizer nag --required-batch-size-multiple 8 \
--sample-break-mode none --t-mult 2.0 --skip-invalid-size-inputs-valid-test \
--tie-adaptive-proj --tie-adaptive-weights --update-freq 3 --weight-decay 0 --seed 1 \
--log-interval 100 --no-progress-bar --skip-invalid-size-inputs-valid-test \
--restore-file path/to/trained/lm/with/quant/noise \
--max-update 13500 --quantization-config-path /path/to/config/yaml
```
If you have less GPU capacity or if your distributed training freezes, try reducing `--max-tokens` and `--tokens-per-sample` (this may reduce the accuracy of the quantized model a bit).
### Remarks
We try to keep the open-sourced code as readable and as easy to plug in as possible. Therefore, we did not test it for the following cases:
- Scalar quantization with RoBERTa.
- Quantization with iPQ and `int8` combined.
If you have trouble adapting it, we will be more than happy to help!
## Looking to reproduce the Vision results in the paper?
We are working on open-sourcing our code as part of ClassyVision. Please check back.
## Having an issue or have a question?
Please open an issue in this repository with the details of your question. Thanks!
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# This file defines example configuration arguments for quantizing
# a transformer model with product quantization
# Number of Centroids for Product Quantization, by default 256 (byte-aligned)
n_centroids:
Linear:
key: in_features
value: {"*": 256}
Embedding:
key: embedding_dim
value: {"*": 256}
# Block Sizes for Product Quantization
# We suggest: 8 for FFN, 4 for ATTN, 4 for embedding projections, 8 for embeddings
block_sizes:
Linear:
key: fuzzy_name
value: {fc: 8, attn: 4, emb: 4}
Embedding:
key: fuzzy_name
value: {emb: 8}
# Layers to Quantize Sequentially
# We suggest: first FFN, then EMB, then ATTN
layers_to_quantize:
- decoder\\.layers\\.\d+\\.fc[12]
- decoder\\.embed_tokens\\.embeddings\\.[012]\\.[01]
- decoder\\.layers\\.\d+\\.self_attn\\.(k_proj|v_proj|q_proj|out_proj)
# Finetuning RoBERTa on a custom classification task
This example shows how to finetune RoBERTa on the IMDB dataset, but should illustrate the process for most classification tasks.
### 1) Get the data
```bash
wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar zxvf aclImdb_v1.tar.gz
```
### 2) Format data
`IMDB` data has one sample per file; the Python snippet below gathers the samples into a single input file and a single label file for each of the train and dev splits, for ease of processing.
```python
import argparse
import os
import random
from glob import glob
random.seed(0)
def main(args):
    for split in ['train', 'test']:
        samples = []
        for class_label in ['pos', 'neg']:
            fnames = glob(os.path.join(args.datadir, split, class_label) + '/*.txt')
            for fname in fnames:
                with open(fname) as fin:
                    # each IMDB file contains a single review on one line
                    line = fin.readline()
                    samples.append((line, 1 if class_label == 'pos' else 0))
        random.shuffle(samples)
        out_fname = 'train' if split == 'train' else 'dev'
        # write one review per line and the matching 0/1 label on the same line number
        f1 = open(os.path.join(args.datadir, out_fname + '.input0'), 'w')
        f2 = open(os.path.join(args.datadir, out_fname + '.label'), 'w')
        for sample in samples:
            f1.write(sample[0] + '\n')
            f2.write(str(sample[1]) + '\n')
        f1.close()
        f2.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--datadir', default='aclImdb')
args = parser.parse_args()
main(args)
```
### 3) BPE encode
Run `multiprocessing_bpe_encoder`; you could also BPE-encode each sample in the previous step, but that would be slower.
```bash
# Download encoder.json and vocab.bpe
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
for SPLIT in train dev; do
python -m examples.roberta.multiprocessing_bpe_encoder \
--encoder-json encoder.json \
--vocab-bpe vocab.bpe \
--inputs "aclImdb/$SPLIT.input0" \
--outputs "aclImdb/$SPLIT.input0.bpe" \
--workers 60 \
--keep-empty
done
```
### 4) Preprocess data
```bash
# Download fairseq dictionary.
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
fairseq-preprocess \
--only-source \
--trainpref "aclImdb/train.input0.bpe" \
--validpref "aclImdb/dev.input0.bpe" \
--destdir "IMDB-bin/input0" \
--workers 60 \
--srcdict dict.txt
fairseq-preprocess \
--only-source \
--trainpref "aclImdb/train.label" \
--validpref "aclImdb/dev.label" \
--destdir "IMDB-bin/label" \
--workers 60
```
### 5) Run training
```bash
TOTAL_NUM_UPDATES=7812 # 10 epochs through IMDB for bsz 32
WARMUP_UPDATES=469 # 6 percent of the number of updates
LR=1e-05 # Peak LR for polynomial LR scheduler.
HEAD_NAME=imdb_head # Custom name for the classification head.
NUM_CLASSES=2 # Number of classes for the classification task.
MAX_SENTENCES=8 # Batch size.
ROBERTA_PATH=/path/to/roberta.large/model.pt
CUDA_VISIBLE_DEVICES=0 fairseq-train IMDB-bin/ \
--restore-file $ROBERTA_PATH \
--max-positions 512 \
--batch-size $MAX_SENTENCES \
--max-tokens 4400 \
--task sentence_prediction \
--reset-optimizer --reset-dataloader --reset-meters \
--required-batch-size-multiple 1 \
--init-token 0 --separator-token 2 \
--arch roberta_large \
--criterion sentence_prediction \
--classification-head-name $HEAD_NAME \
--num-classes $NUM_CLASSES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
--clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--max-epoch 10 \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--shorten-method "truncate" \
--find-unused-parameters \
--update-freq 4
```
The above command will finetune RoBERTa-large with an effective batch-size of 32
sentences (`--batch-size=8 --update-freq=4`). The expected
`best-validation-accuracy` after 10 epochs is ~96.5%.
If you run out of GPU memory, try decreasing `--batch-size` and increasing
`--update-freq` to compensate.
### 6) Load model using hub interface
Now we can load the trained model checkpoint using the RoBERTa hub interface.
Assuming your checkpoints are stored in `checkpoints/`:
```python
from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained(
'checkpoints',
checkpoint_file='checkpoint_best.pt',
data_name_or_path='IMDB-bin'
)
roberta.eval() # disable dropout
```
Finally you can make predictions using the `imdb_head` (or whatever you set
`--classification-head-name` to during training):
```python
label_fn = lambda label: roberta.task.label_dictionary.string(
[label + roberta.task.label_dictionary.nspecial]
)
tokens = roberta.encode('Best movie this year')
pred = label_fn(roberta.predict('imdb_head', tokens).argmax().item())
assert pred == '1' # positive
tokens = roberta.encode('Worst movie ever')
pred = label_fn(roberta.predict('imdb_head', tokens).argmax().item())
assert pred == '0' # negative
```
# Finetuning RoBERTa on GLUE tasks
### 1) Download the data from the GLUE website (https://gluebenchmark.com/tasks) using the following commands:
```bash
wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py
python download_glue_data.py --data_dir glue_data --tasks all
```
### 2) Preprocess GLUE task data:
```bash
./examples/roberta/preprocess_GLUE_tasks.sh glue_data <glue_task_name>
```
`glue_task_name` is one of the following:
`{ALL, QQP, MNLI, QNLI, MRPC, RTE, STS-B, SST-2, CoLA}`
Use `ALL` to preprocess all the GLUE tasks.
### 3) Fine-tuning on GLUE task:
Example fine-tuning command for the `RTE` task:
```bash
TOTAL_NUM_UPDATES=2036 # 10 epochs through RTE for bsz 16
WARMUP_UPDATES=122 # 6 percent of the number of updates
LR=2e-05 # Peak LR for polynomial LR scheduler.
NUM_CLASSES=2
MAX_SENTENCES=16 # Batch size.
ROBERTA_PATH=/path/to/roberta/model.pt
CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin/ \
--restore-file $ROBERTA_PATH \
--max-positions 512 \
--batch-size $MAX_SENTENCES \
--max-tokens 4400 \
--task sentence_prediction \
--reset-optimizer --reset-dataloader --reset-meters \
--required-batch-size-multiple 1 \
--init-token 0 --separator-token 2 \
--arch roberta_large \
--criterion sentence_prediction \
--num-classes $NUM_CLASSES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
--clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--max-epoch 10 \
--find-unused-parameters \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric;
```
For each GLUE task, you will need to use the following command-line arguments:
Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B
---|---|---|---|---|---|---|---|---
`--num-classes` | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 1
`--lr` | 1e-5 | 1e-5 | 1e-5 | 2e-5 | 1e-5 | 1e-5 | 1e-5 | 2e-5
`--batch-size` | 32 | 32 | 32 | 16 | 32 | 16 | 16 | 16
`--total-num-update` | 123873 | 33112 | 113272 | 2036 | 20935 | 2296 | 5336 | 3598
`--warmup-updates` | 7432 | 1986 | 28318 | 122 | 1256 | 137 | 320 | 214
For `STS-B` additionally add `--regression-target --best-checkpoint-metric loss` and remove `--maximize-best-checkpoint-metric`.
**Note:**
a) `--total-num-update` is used by the `polynomial_decay` scheduler and is calculated for `--max-epoch=10` and `--batch-size=16/32` depending on the task.
b) The above command-line arguments and hyperparameters were tested on a single Nvidia `V100` GPU with `32GB` of memory for each task. Depending on the GPU memory available to you, you can increase `--update-freq` and reduce `--batch-size`.
c) All the settings in the above table are suggested settings based on our hyperparameter search within a fixed search space (for careful comparison across models). You might be able to find better metrics with a wider hyperparameter search.
### Inference on GLUE task
After training the model as described in the previous step, you can perform inference with the checkpoints in the `checkpoints/` directory using the following Python code snippet:
```python
from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained(
'checkpoints/',
checkpoint_file='checkpoint_best.pt',
data_name_or_path='RTE-bin'
)
label_fn = lambda label: roberta.task.label_dictionary.string(
[label + roberta.task.label_dictionary.nspecial]
)
ncorrect, nsamples = 0, 0
roberta.cuda()
roberta.eval()
with open('glue_data/RTE/dev.tsv') as fin:
fin.readline()
for index, line in enumerate(fin):
tokens = line.strip().split('\t')
sent1, sent2, target = tokens[1], tokens[2], tokens[3]
tokens = roberta.encode(sent1, sent2)
prediction = roberta.predict('sentence_classification_head', tokens).argmax().item()
prediction_label = label_fn(prediction)
ncorrect += int(prediction_label == target)
nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
```
# RoBERTa: A Robustly Optimized BERT Pretraining Approach
https://arxiv.org/abs/1907.11692
## Introduction
RoBERTa iterates on BERT's pretraining procedure, including training the model longer, with bigger batches over more data; removing the next sentence prediction objective; training on longer sequences; and dynamically changing the masking pattern applied to the training data. See the associated paper for more details.
### What's New:
- December 2020: German model (GottBERT) is available: [GottBERT](https://github.com/pytorch/fairseq/tree/master/examples/gottbert).
- January 2020: Italian model (UmBERTo) is available from Musixmatch Research: [UmBERTo](https://github.com/musixmatchresearch/umberto).
- November 2019: French model (CamemBERT) is available: [CamemBERT](https://github.com/pytorch/fairseq/tree/master/examples/camembert).
- November 2019: Multilingual encoder (XLM-RoBERTa) is available: [XLM-R](https://github.com/pytorch/fairseq/tree/master/examples/xlmr).
- September 2019: TensorFlow and TPU support via the [transformers library](https://github.com/huggingface/transformers).
- August 2019: RoBERTa is now supported in the [pytorch-transformers library](https://github.com/huggingface/pytorch-transformers).
- August 2019: Added [tutorial for finetuning on WinoGrande](https://github.com/pytorch/fairseq/tree/master/examples/roberta/wsc#roberta-training-on-winogrande-dataset).
- August 2019: Added [tutorial for pretraining RoBERTa using your own data](README.pretraining.md).
## Pre-trained models
Model | Description | # params | Download
---|---|---|---
`roberta.base` | RoBERTa using the BERT-base architecture | 125M | [roberta.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz)
`roberta.large` | RoBERTa using the BERT-large architecture | 355M | [roberta.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz)
`roberta.large.mnli` | `roberta.large` finetuned on [MNLI](http://www.nyu.edu/projects/bowman/multinli) | 355M | [roberta.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz)
`roberta.large.wsc` | `roberta.large` finetuned on [WSC](wsc/README.md) | 355M | [roberta.large.wsc.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz)
## Results
**[GLUE (Wang et al., 2019)](https://gluebenchmark.com/)**
_(dev set, single model, single-task finetuning)_
Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B
---|---|---|---|---|---|---|---|---
`roberta.base` | 87.6 | 92.8 | 91.9 | 78.7 | 94.8 | 90.2 | 63.6 | 91.2
`roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4
`roberta.large.mnli` | 90.2 | - | - | - | - | - | - | -
**[SuperGLUE (Wang et al., 2019)](https://super.gluebenchmark.com/)**
_(dev set, single model, single-task finetuning)_
Model | BoolQ | CB | COPA | MultiRC | RTE | WiC | WSC
---|---|---|---|---|---|---|---
`roberta.large` | 86.9 | 98.2 | 94.0 | 85.7 | 89.5 | 75.6 | -
`roberta.large.wsc` | - | - | - | - | - | - | 91.3
**[SQuAD (Rajpurkar et al., 2018)](https://rajpurkar.github.io/SQuAD-explorer/)**
_(dev set, no additional data used)_
Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1
---|---|---
`roberta.large` | 88.9/94.6 | 86.5/89.4
**[RACE (Lai et al., 2017)](http://www.qizhexie.com/data/RACE_leaderboard.html)**
_(test set)_
Model | Accuracy | Middle | High
---|---|---|---
`roberta.large` | 83.2 | 86.5 | 81.3
**[HellaSwag (Zellers et al., 2019)](https://rowanzellers.com/hellaswag/)**
_(test set)_
Model | Overall | In-domain | Zero-shot | ActivityNet | WikiHow
---|---|---|---|---|---
`roberta.large` | 85.2 | 87.3 | 83.1 | 74.6 | 90.9
**[Commonsense QA (Talmor et al., 2019)](https://www.tau-nlp.org/commonsenseqa)**
_(test set)_
Model | Accuracy
---|---
`roberta.large` (single model) | 72.1
`roberta.large` (ensemble) | 72.5
**[Winogrande (Sakaguchi et al., 2019)](https://arxiv.org/abs/1907.10641)**
_(test set)_
Model | Accuracy
---|---
`roberta.large` | 78.1
**[XNLI (Conneau et al., 2018)](https://arxiv.org/abs/1809.05053)**
_(TRANSLATE-TEST)_
Model | en | fr | es | de | el | bg | ru | tr | ar | vi | th | zh | hi | sw | ur
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
`roberta.large.mnli` | 91.3 | 82.91 | 84.27 | 81.24 | 81.74 | 83.13 | 78.28 | 76.79 | 76.64 | 74.17 | 74.05 | 77.5 | 70.9 | 66.65 | 66.81
## Example usage
##### Load RoBERTa from torch.hub (PyTorch >= 1.1):
```python
import torch
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')
roberta.eval() # disable dropout (or leave in train mode to finetune)
```
##### Load RoBERTa (for PyTorch 1.0 or custom models):
```bash
# Download roberta.large model
wget https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz
tar -xzvf roberta.large.tar.gz
```
```python
# Load the model in fairseq
from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained('/path/to/roberta.large', checkpoint_file='model.pt')
roberta.eval()  # disable dropout (or leave in train mode to finetune)
```
##### Apply Byte-Pair Encoding (BPE) to input text:
```python
tokens = roberta.encode('Hello world!')
assert tokens.tolist() == [0, 31414, 232, 328, 2]
roberta.decode(tokens) # 'Hello world!'
```
##### Extract features from RoBERTa:
```python
# Extract the last layer's features
last_layer_features = roberta.extract_features(tokens)
assert last_layer_features.size() == torch.Size([1, 5, 1024])
# Extract all layers' features (layer 0 is the embedding layer)
all_layers = roberta.extract_features(tokens, return_all_hiddens=True)
assert len(all_layers) == 25
assert torch.all(all_layers[-1] == last_layer_features)
```
##### Use RoBERTa for sentence-pair classification tasks:
```python
# Download RoBERTa already finetuned for MNLI
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
roberta.eval() # disable dropout for evaluation
# Encode a pair of sentences and make a prediction
tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.')
roberta.predict('mnli', tokens).argmax() # 0: contradiction
# Encode another pair of sentences
tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.')
roberta.predict('mnli', tokens).argmax() # 2: entailment
```
##### Register a new (randomly initialized) classification head:
```python
roberta.register_classification_head('new_task', num_classes=3)
logprobs = roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=<LogSoftmaxBackward>)
```
##### Batched prediction:
```python
import torch
from fairseq.data.data_utils import collate_tokens
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
roberta.eval()
batch_of_pairs = [
['Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.'],
['Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.'],
['potatoes are awesome.', 'I like to run.'],
['Mars is very far from earth.', 'Mars is very close.'],
]
batch = collate_tokens(
[roberta.encode(pair[0], pair[1]) for pair in batch_of_pairs], pad_idx=1
)
logprobs = roberta.predict('mnli', batch)
print(logprobs.argmax(dim=1))
# tensor([0, 2, 1, 0])
```
##### Using the GPU:
```python
roberta.cuda()
roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)
```
## Advanced usage
#### Filling masks:
RoBERTa can be used to fill `<mask>` tokens in the input. Some examples from the
[Natural Questions dataset](https://ai.google.com/research/NaturalQuestions/):
```python
roberta.fill_mask('The first Star wars movie came out in <mask>', topk=3)
# [('The first Star wars movie came out in 1977', 0.9504708051681519, ' 1977'), ('The first Star wars movie came out in 1978', 0.009986862540245056, ' 1978'), ('The first Star wars movie came out in 1979', 0.009574787691235542, ' 1979')]
roberta.fill_mask('Vikram samvat calender is official in <mask>', topk=3)
# [('Vikram samvat calender is official in India', 0.21878819167613983, ' India'), ('Vikram samvat calender is official in Delhi', 0.08547237515449524, ' Delhi'), ('Vikram samvat calender is official in Gujarat', 0.07556215673685074, ' Gujarat')]
roberta.fill_mask('<mask> is the common currency of the European Union', topk=3)
# [('Euro is the common currency of the European Union', 0.9456493854522705, 'Euro'), ('euro is the common currency of the European Union', 0.025748178362846375, 'euro'), ('€ is the common currency of the European Union', 0.011183084920048714, '€')]
```
#### Pronoun disambiguation (Winograd Schema Challenge):
RoBERTa can be used to disambiguate pronouns. First install spaCy and download the English-language model:
```bash
pip install spacy
python -m spacy download en_core_web_lg
```
Next load the `roberta.large.wsc` model and call the `disambiguate_pronoun`
function. The pronoun should be surrounded by square brackets (`[]`) and the
query referent surrounded by underscores (`_`), or left blank to return the
predicted candidate text directly:
```python
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.wsc', user_dir='examples/roberta/wsc')
roberta.cuda() # use the GPU (optional)
roberta.disambiguate_pronoun('The _trophy_ would not fit in the brown suitcase because [it] was too big.')
# True
roberta.disambiguate_pronoun('The trophy would not fit in the brown _suitcase_ because [it] was too big.')
# False
roberta.disambiguate_pronoun('The city councilmen refused the demonstrators a permit because [they] feared violence.')
# 'The city councilmen'
roberta.disambiguate_pronoun('The city councilmen refused the demonstrators a permit because [they] advocated violence.')
# 'demonstrators'
```
See the [RoBERTa Winograd Schema Challenge (WSC) README](wsc/README.md) for more details on how to train this model.
#### Extract features aligned to words:
By default RoBERTa outputs one feature vector per BPE token. You can instead
realign the features to match [spaCy's word-level tokenization](https://spacy.io/usage/linguistic-features#tokenization)
with the `extract_features_aligned_to_words` method. This will compute a
weighted average of the BPE-level features for each word and expose them in
spaCy's `Token.vector` attribute:
```python
doc = roberta.extract_features_aligned_to_words('I said, "hello RoBERTa."')
assert len(doc) == 10
for tok in doc:
print('{:10}{} (...)'.format(str(tok), tok.vector[:5]))
# <s> tensor([-0.1316, -0.0386, -0.0832, -0.0477, 0.1943], grad_fn=<SliceBackward>) (...)
# I tensor([ 0.0559, 0.1541, -0.4832, 0.0880, 0.0120], grad_fn=<SliceBackward>) (...)
# said tensor([-0.1565, -0.0069, -0.8915, 0.0501, -0.0647], grad_fn=<SliceBackward>) (...)
# , tensor([-0.1318, -0.0387, -0.0834, -0.0477, 0.1944], grad_fn=<SliceBackward>) (...)
# " tensor([-0.0486, 0.1818, -0.3946, -0.0553, 0.0981], grad_fn=<SliceBackward>) (...)
# hello tensor([ 0.0079, 0.1799, -0.6204, -0.0777, -0.0923], grad_fn=<SliceBackward>) (...)
# RoBERTa tensor([-0.2339, -0.1184, -0.7343, -0.0492, 0.5829], grad_fn=<SliceBackward>) (...)
# . tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=<SliceBackward>) (...)
# " tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=<SliceBackward>) (...)
# </s> tensor([-0.0930, -0.0392, -0.0821, 0.0158, 0.0649], grad_fn=<SliceBackward>) (...)
```
#### Evaluating the `roberta.large.mnli` model:
Example python code snippet to evaluate accuracy on the MNLI `dev_matched` set.
```python
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
ncorrect, nsamples = 0, 0
roberta.cuda()
roberta.eval()
with open('glue_data/MNLI/dev_matched.tsv') as fin:
fin.readline()
for index, line in enumerate(fin):
tokens = line.strip().split('\t')
sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
tokens = roberta.encode(sent1, sent2)
prediction = roberta.predict('mnli', tokens).argmax().item()
prediction_label = label_map[prediction]
ncorrect += int(prediction_label == target)
nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# Expected output: 0.9060
```
## Finetuning
- [Finetuning on GLUE](README.glue.md)
- [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md)
- [Finetuning on Winograd Schema Challenge (WSC)](wsc/README.md)
- [Finetuning on Commonsense QA (CQA)](commonsense_qa/README.md)
## Pretraining using your own data
See the [tutorial for pretraining RoBERTa using your own data](README.pretraining.md).
## Citation
```bibtex
@article{liu2019roberta,
title = {RoBERTa: A Robustly Optimized BERT Pretraining Approach},
author = {Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and
Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and
Luke Zettlemoyer and Veselin Stoyanov},
journal={arXiv preprint arXiv:1907.11692},
year = {2019},
}
```
# Pretraining RoBERTa using your own data
This tutorial will walk you through pretraining RoBERTa over your own data.
### 1) Preprocess the data
Data should be preprocessed following the [language modeling format](/examples/language_model), i.e. each document should be separated by an empty line (only useful with `--sample-break-mode complete_doc`). Lines will be concatenated as a 1D text stream during training.
We'll use the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/)
to demonstrate how to preprocess raw text data with the GPT-2 BPE. Of course
this dataset is quite small, so the resulting pretrained model will perform
poorly, but it gives the general idea.
First download the dataset:
```bash
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
unzip wikitext-103-raw-v1.zip
```
Next encode it with the GPT-2 BPE:
```bash
mkdir -p gpt2_bpe
wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
for SPLIT in train valid test; do \
python -m examples.roberta.multiprocessing_bpe_encoder \
--encoder-json gpt2_bpe/encoder.json \
--vocab-bpe gpt2_bpe/vocab.bpe \
--inputs wikitext-103-raw/wiki.${SPLIT}.raw \
--outputs wikitext-103-raw/wiki.${SPLIT}.bpe \
--keep-empty \
--workers 60; \
done
```
Finally preprocess/binarize the data using the GPT-2 fairseq dictionary:
```bash
wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
fairseq-preprocess \
--only-source \
--srcdict gpt2_bpe/dict.txt \
--trainpref wikitext-103-raw/wiki.train.bpe \
--validpref wikitext-103-raw/wiki.valid.bpe \
--testpref wikitext-103-raw/wiki.test.bpe \
--destdir data-bin/wikitext-103 \
--workers 60
```
### 2) Train RoBERTa base
```bash
TOTAL_UPDATES=125000 # Total number of training steps
WARMUP_UPDATES=10000 # Warmup the learning rate over this many updates
PEAK_LR=0.0005 # Peak learning rate, adjust as needed
TOKENS_PER_SAMPLE=512 # Max sequence length
MAX_POSITIONS=512 # Num. positional embeddings (usually same as above)
MAX_SENTENCES=16 # Number of sequences per batch (batch size)
UPDATE_FREQ=16 # Increase the batch size 16x
DATA_DIR=data-bin/wikitext-103
fairseq-train --fp16 $DATA_DIR \
--task masked_lm --criterion masked_lm \
--arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \
--optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $PEAK_LR --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \
--dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
--batch-size $MAX_SENTENCES --update-freq $UPDATE_FREQ \
--max-update $TOTAL_UPDATES --log-format simple --log-interval 1
```
**Note:** You can optionally resume training the released RoBERTa base model by
adding `--restore-file /path/to/roberta.base/model.pt`.
**Note:** The above command assumes training on 8x32GB V100 GPUs. Each GPU uses
a batch size of 16 sequences (`$MAX_SENTENCES`) and accumulates gradients to
further increase the batch size by 16x (`$UPDATE_FREQ`), for a total batch size
of 2048 sequences. If you have fewer GPUs or GPUs with less memory you may need
to reduce `$MAX_SENTENCES` and increase `$UPDATE_FREQ` to compensate.
Alternatively if you have more GPUs you can decrease `$UPDATE_FREQ` accordingly
to increase training speed.
**Note:** The learning rate and batch size are tightly connected and need to be
adjusted together. We generally recommend increasing the learning rate as you
increase the batch size according to the following table (although it's also
dataset dependent, so don't rely on the following values too closely):
batch size | peak learning rate
---|---
256 | 0.0001
2048 | 0.0005
8192 | 0.0007
### 3) Load your pretrained model
```python
import torch
from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'path/to/data')
assert isinstance(roberta.model, torch.nn.Module)
```
# Finetuning RoBERTa on RACE tasks
### 1) Download the data from the RACE website (http://www.cs.cmu.edu/~glai1/data/race/)
### 2) Preprocess RACE data:
```bash
python ./examples/roberta/preprocess_RACE.py --input-dir <input-dir> --output-dir <extracted-data-dir>
./examples/roberta/preprocess_RACE.sh <extracted-data-dir> <output-dir>
```
### 3) Fine-tuning on RACE:
```bash
MAX_EPOCH=5 # Number of training epochs.
LR=1e-05 # Peak LR for fixed LR scheduler.
NUM_CLASSES=4
MAX_SENTENCES=1 # Batch size per GPU.
UPDATE_FREQ=8 # Accumulate gradients to simulate training on 8 GPUs.
DATA_DIR=/path/to/race-output-dir
ROBERTA_PATH=/path/to/roberta/model.pt
CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR --ddp-backend=legacy_ddp \
--restore-file $ROBERTA_PATH \
--reset-optimizer --reset-dataloader --reset-meters \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--task sentence_ranking \
--num-classes $NUM_CLASSES \
--init-token 0 --separator-token 2 \
--max-option-length 128 \
--max-positions 512 \
--shorten-method "truncate" \
--arch roberta_large \
--dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
--criterion sentence_ranking \
--optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \
--clip-norm 0.0 \
--lr-scheduler fixed --lr $LR \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--batch-size $MAX_SENTENCES \
--required-batch-size-multiple 1 \
--update-freq $UPDATE_FREQ \
--max-epoch $MAX_EPOCH
```
**Note:**
a) As contexts in RACE are relatively long, we use a smaller batch size per GPU while increasing `--update-freq` to achieve a larger effective batch size.
b) The above command-line arguments and hyperparameters were tested on a single Nvidia `V100` GPU with `32GB` of memory for each task. Depending on the GPU memory available to you, you can increase `--update-freq` and reduce `--batch-size`.
c) The settings in the above command are based on our hyperparameter search within a fixed search space (for careful comparison across models). You might be able to find better metrics with a wider hyperparameter search.
### 4) Evaluation:
```bash
DATA_DIR=/path/to/race-output-dir # data directory used during training
MODEL_PATH=/path/to/checkpoint_best.pt # path to the finetuned model checkpoint
PREDS_OUT=preds.tsv # output file path to save prediction
TEST_SPLIT=test # can be test (Middle) or test1 (High)
fairseq-validate \
$DATA_DIR \
--valid-subset $TEST_SPLIT \
--path $MODEL_PATH \
--batch-size 1 \
--task sentence_ranking \
--criterion sentence_ranking \
--save-predictions $PREDS_OUT
```
# Finetuning RoBERTa on Commonsense QA
We follow a similar approach to [finetuning RACE](../README.race.md). Specifically
for each question we construct five inputs, one for each of the five candidate
answer choices. Each input is constructed by concatenating the question and
candidate answer. We then encode each input and pass the resulting "[CLS]"
representations through a fully-connected layer to predict the correct answer.
We train with a standard cross-entropy loss.
We also found it helpful to prepend a prefix of `Q:` to the question and `A:` to
the answer. The complete input format is:
```
<s> Q: Where would I not want a fox? </s> A: hen house </s>
```
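As a small illustration (not part of the original recipe), this layout is what RoBERTa's `encode` produces when the question and the answer are passed as two segments with `no_separator=True`, exactly as in the evaluation snippet in step 3 below:
```python
import torch

# Any RoBERTa checkpoint with the same BPE would do; we load the released
# roberta.large purely to demonstrate the encoding.
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')
roberta.eval()
tokens = roberta.encode('Q: Where would I not want a fox?', 'A: hen house', no_separator=True)
# `tokens` is a 1D tensor of token ids corresponding to:
#   <s> Q: Where would I not want a fox? </s> A: hen house </s>
```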
Our final submission is based on a hyperparameter search over the learning rate
(1e-5, 2e-5, 3e-5), batch size (8, 16), number of training steps (2000, 3000,
4000) and random seed. We selected the model with the best performance on the
development set after 100 trials.
### 1) Download data from the Commonsense QA website (https://www.tau-nlp.org/commonsenseqa)
```bash
bash examples/roberta/commonsense_qa/download_cqa_data.sh
```
### 2) Finetune
```bash
MAX_UPDATES=3000 # Number of training steps.
WARMUP_UPDATES=150 # Linearly increase LR over this many steps.
LR=1e-05 # Peak LR for polynomial LR scheduler.
MAX_SENTENCES=16 # Batch size.
SEED=1 # Random seed.
ROBERTA_PATH=/path/to/roberta/model.pt
DATA_DIR=data/CommonsenseQA
# we use the --user-dir option to load the task from
# the examples/roberta/commonsense_qa directory:
FAIRSEQ_PATH=/path/to/fairseq
FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/commonsense_qa
CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=legacy_ddp \
$DATA_DIR \
--user-dir $FAIRSEQ_USER_DIR \
--restore-file $ROBERTA_PATH \
--reset-optimizer --reset-dataloader --reset-meters \
--no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--task commonsense_qa --init-token 0 --bpe gpt2 \
--arch roberta_large --max-positions 512 \
--dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
--criterion sentence_ranking --num-classes 5 \
--optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 --clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $LR \
--warmup-updates $WARMUP_UPDATES --total-num-update $MAX_UPDATES \
--batch-size $MAX_SENTENCES \
--max-update $MAX_UPDATES \
--log-format simple --log-interval 25 \
--seed $SEED
```
The above command assumes training on 1 GPU with 32GB of RAM. For GPUs with
less memory, decrease `--batch-size` and increase `--update-freq`
accordingly to compensate.
### 3) Evaluate
```python
import json
import torch
from fairseq.models.roberta import RobertaModel
from examples.roberta import commonsense_qa # load the Commonsense QA task
roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'data/CommonsenseQA')
roberta.eval() # disable dropout
roberta.cuda() # use the GPU (optional)
nsamples, ncorrect = 0, 0
with open('data/CommonsenseQA/valid.jsonl') as h:
for line in h:
example = json.loads(line)
scores = []
for choice in example['question']['choices']:
input = roberta.encode(
'Q: ' + example['question']['stem'],
'A: ' + choice['text'],
no_separator=True
)
score = roberta.predict('sentence_classification_head', input, return_logits=True)
scores.append(score)
pred = torch.cat(scores).argmax()
answer = ord(example['answerKey']) - ord('A')
nsamples += 1
if pred == answer:
ncorrect += 1
print('Accuracy: ' + str(ncorrect / float(nsamples)))
# Accuracy: 0.7846027846027847
```
The above snippet is not batched, which makes it quite slow. See [instructions
for batched prediction with RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta#batched-prediction).
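As a rough sketch of what a batched variant could look like, the snippet below groups the five answer choices of each question into a single forward pass; the paths and the `sentence_classification_head` name mirror the snippet above and remain assumptions about your setup.
```python
import json

import torch
from fairseq.data.data_utils import collate_tokens
from fairseq.models.roberta import RobertaModel

roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'data/CommonsenseQA')
roberta.eval()  # disable dropout
roberta.cuda()  # use the GPU (optional)

nsamples, ncorrect = 0, 0
with open('data/CommonsenseQA/valid.jsonl') as h:
    for line in h:
        example = json.loads(line)
        # Encode all five (question, choice) pairs and right-pad them into one batch.
        batch = collate_tokens(
            [
                roberta.encode(
                    'Q: ' + example['question']['stem'],
                    'A: ' + choice['text'],
                    no_separator=True,
                )
                for choice in example['question']['choices']
            ],
            pad_idx=1,
        )
        with torch.no_grad():
            # One forward pass scores all five choices at once.
            scores = roberta.predict('sentence_classification_head', batch, return_logits=True)
        pred = scores.squeeze(-1).argmax().item()
        answer = ord(example['answerKey']) - ord('A')
        nsamples += 1
        ncorrect += int(pred == answer)
print('Accuracy: ' + str(ncorrect / float(nsamples)))
```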
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from . import commonsense_qa_task # noqa
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import json
import os
import numpy as np
import torch
from fairseq.data import (
Dictionary,
IdDataset,
ListDataset,
NestedDictionaryDataset,
NumelDataset,
NumSamplesDataset,
RawLabelDataset,
RightPadDataset,
SortDataset,
data_utils,
encoders,
)
from fairseq.tasks import LegacyFairseqTask, register_task
@register_task("commonsense_qa")
class CommonsenseQATask(LegacyFairseqTask):
"""Task to finetune RoBERTa for Commonsense QA."""
@staticmethod
def add_args(parser):
"""Add task-specific arguments to the parser."""
parser.add_argument(
"data", metavar="DIR", help="path to data directory; we load <split>.jsonl"
)
parser.add_argument(
"--init-token",
type=int,
default=None,
help="add token at the beginning of each batch item",
)
parser.add_argument("--num-classes", type=int, default=5)
def __init__(self, args, vocab):
super().__init__(args)
self.vocab = vocab
self.mask = vocab.add_symbol("<mask>")
self.bpe = encoders.build_bpe(args)
@classmethod
def load_dictionary(cls, filename):
"""Load the dictionary from the filename
Args:
filename (str): the filename
"""
dictionary = Dictionary.load(filename)
dictionary.add_symbol("<mask>")
return dictionary
@classmethod
def setup_task(cls, args, **kwargs):
assert (
args.criterion == "sentence_ranking"
), "Must set --criterion=sentence_ranking"
# load data and label dictionaries
vocab = cls.load_dictionary(os.path.join(args.data, "dict.txt"))
print("| dictionary: {} types".format(len(vocab)))
return cls(args, vocab)
def load_dataset(
self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs
):
"""Load a given dataset split.
Args:
split (str): name of the split (e.g., train, valid, test)
"""
def binarize(s, append_bos=False):
if self.bpe is not None:
s = self.bpe.encode(s)
tokens = self.vocab.encode_line(
s,
append_eos=True,
add_if_not_exist=False,
).long()
if append_bos and self.args.init_token is not None:
tokens = torch.cat([tokens.new([self.args.init_token]), tokens])
return tokens
if data_path is None:
data_path = os.path.join(self.args.data, split + ".jsonl")
if not os.path.exists(data_path):
raise FileNotFoundError("Cannot find data: {}".format(data_path))
src_tokens = [[] for i in range(self.args.num_classes)]
src_lengths = [[] for i in range(self.args.num_classes)]
labels = []
with open(data_path) as h:
for line in h:
example = json.loads(line.strip())
if "answerKey" in example:
label = ord(example["answerKey"]) - ord("A")
labels.append(label)
question = example["question"]["stem"]
assert len(example["question"]["choices"]) == self.args.num_classes
# format: `<s> Q: Where would I not want a fox? </s> A: hen house </s>`
question = "Q: " + question
question_toks = binarize(question, append_bos=True)
for i, choice in enumerate(example["question"]["choices"]):
src = "A: " + choice["text"]
src_bin = torch.cat([question_toks, binarize(src)])
src_tokens[i].append(src_bin)
src_lengths[i].append(len(src_bin))
assert all(
len(src_tokens[0]) == len(src_tokens[i])
for i in range(self.args.num_classes)
)
assert len(src_tokens[0]) == len(src_lengths[0])
assert len(labels) == 0 or len(labels) == len(src_tokens[0])
for i in range(self.args.num_classes):
src_lengths[i] = np.array(src_lengths[i])
src_tokens[i] = ListDataset(src_tokens[i], src_lengths[i])
src_lengths[i] = ListDataset(src_lengths[i])
dataset = {
"id": IdDataset(),
"nsentences": NumSamplesDataset(),
"ntokens": NumelDataset(src_tokens[0], reduce=True),
}
for i in range(self.args.num_classes):
dataset.update(
{
"net_input{}".format(i + 1): {
"src_tokens": RightPadDataset(
src_tokens[i],
pad_idx=self.source_dictionary.pad(),
),
"src_lengths": src_lengths[i],
}
}
)
if len(labels) > 0:
dataset.update({"target": RawLabelDataset(labels)})
dataset = NestedDictionaryDataset(
dataset,
sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])],
)
with data_utils.numpy_seed(self.args.seed):
dataset = SortDataset(
dataset,
# shuffle
sort_order=[np.random.permutation(len(dataset))],
)
print("| Loaded {} with {} samples".format(split, len(dataset)))
self.datasets[split] = dataset
return self.datasets[split]
def build_model(self, args):
from fairseq import models
model = models.build_model(args, self)
model.register_classification_head(
"sentence_classification_head",
num_classes=1,
)
return model
@property
def source_dictionary(self):
return self.vocab
@property
def target_dictionary(self):
return self.vocab
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
OUTDIR=data/CommonsenseQA
mkdir -p $OUTDIR
wget -O $OUTDIR/train.jsonl https://s3.amazonaws.com/commensenseqa/train_rand_split.jsonl
wget -O $OUTDIR/valid.jsonl https://s3.amazonaws.com/commensenseqa/dev_rand_split.jsonl
wget -O $OUTDIR/test.jsonl https://s3.amazonaws.com/commensenseqa/test_rand_split_no_answers.jsonl
wget -O $OUTDIR/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import contextlib
import sys
from collections import Counter
from multiprocessing import Pool
from fairseq.data.encoders.gpt2_bpe import get_encoder
def main():
"""
Helper script to encode raw text with the GPT-2 BPE using multiple processes.
The encoder.json and vocab.bpe files can be obtained here:
- https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
- https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"--encoder-json",
help="path to encoder.json",
)
parser.add_argument(
"--vocab-bpe",
type=str,
help="path to vocab.bpe",
)
parser.add_argument(
"--inputs",
nargs="+",
default=["-"],
help="input files to filter/encode",
)
parser.add_argument(
"--outputs",
nargs="+",
default=["-"],
help="path to save encoded outputs",
)
parser.add_argument(
"--keep-empty",
action="store_true",
help="keep empty lines",
)
parser.add_argument("--workers", type=int, default=20)
args = parser.parse_args()
assert len(args.inputs) == len(
args.outputs
), "number of input and output paths should match"
with contextlib.ExitStack() as stack:
inputs = [
stack.enter_context(open(input, "r", encoding="utf-8"))
if input != "-"
else sys.stdin
for input in args.inputs
]
outputs = [
stack.enter_context(open(output, "w", encoding="utf-8"))
if output != "-"
else sys.stdout
for output in args.outputs
]
encoder = MultiprocessingEncoder(args)
pool = Pool(args.workers, initializer=encoder.initializer)
encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 100)
stats = Counter()
for i, (filt, enc_lines) in enumerate(encoded_lines, start=1):
if filt == "PASS":
for enc_line, output_h in zip(enc_lines, outputs):
print(enc_line, file=output_h)
else:
stats["num_filtered_" + filt] += 1
if i % 10000 == 0:
print("processed {} lines".format(i), file=sys.stderr)
for k, v in stats.most_common():
print("[{}] filtered {} lines".format(k, v), file=sys.stderr)
class MultiprocessingEncoder(object):
def __init__(self, args):
self.args = args
def initializer(self):
global bpe
bpe = get_encoder(self.args.encoder_json, self.args.vocab_bpe)
def encode(self, line):
global bpe
ids = bpe.encode(line)
return list(map(str, ids))
def decode(self, tokens):
global bpe
return bpe.decode(tokens)
def encode_lines(self, lines):
"""
Encode a set of lines. All lines will be encoded together.
"""
enc_lines = []
for line in lines:
line = line.strip()
if len(line) == 0 and not self.args.keep_empty:
return ["EMPTY", None]
tokens = self.encode(line)
enc_lines.append(" ".join(tokens))
return ["PASS", enc_lines]
def decode_lines(self, lines):
dec_lines = []
for line in lines:
tokens = map(int, line.strip().split())
dec_lines.append(self.decode(tokens))
return ["PASS", dec_lines]
if __name__ == "__main__":
main()