Commit 72f5785f authored by huaerkl's avatar huaerkl
Browse files

v1.0

parents
Pipeline #505 canceled with stages
import argparse
from pathlib import Path
import soundfile
def get_insl_frame(parse):
out = []
def is_ont_token(tok):
return tok[0] in ["[", "]"];
res = []
x = []
for tok in parse.split():
if is_ont_token(tok):
res.extend('_'.join(x))
x = []
res.append(tok.upper())
else:
x.append(tok.upper())
return " ".join(res) + ' | '
def sequencify_utterance(utterance):
utterance = utterance.upper()
utterance = utterance.replace(' ', '|') + '|'
utterance = list(utterance)
utterance = ' '.join(utterance)
return utterance
def generate_fairseq_manifests(manifest, output_path, audio_root=None):
with open(manifest, 'r') as i:
parses = []
utterances = []
filepaths = []
keys = None
for (idx, line) in enumerate(i):
if idx == 0: keys = line.strip().split('\t')
else:
data = { k: v for (k, v) in zip(keys, line.split('\t'))}
parses.append(get_insl_frame(data['decoupled_normalized_seqlogical']))
utterances.append(sequencify_utterance(data['normalized_utterance']))
filepaths.append(data['file_id'])
parses_fp = output_path.with_suffix('.parse')
with open(str(parses_fp), 'w') as o:
for p in parses:
o.write(p + '\n')
utterances_fp = output_path.with_suffix('.ltr')
with open(str(utterances_fp), 'w') as o:
for u in utterances:
o.write(u + '\n')
filepaths_fp = output_path.with_suffix('.tsv')
with open(str(filepaths_fp), 'w') as o:
o.write(str(audio_root) + '\n')
for f in filepaths:
fullpath = audio_root / f
assert fullpath.exists(), f'{fullpath}'
frames = soundfile.info(fullpath).frames
o.write(f'{f}\t{frames}\n')
def main(args):
splits = ['train', 'eval', 'test']
root = Path(args.stop_root)
output_root = Path(args.output)
for split in splits:
stop_manifest_path = root / 'manifests' / (split + '.tsv')
output_path = output_root / (split)
generate_fairseq_manifests(stop_manifest_path, output_path, root)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('--stop_root', type=str,
help='path to stop root directory')
parser.add_argument('--output', type=str,
help='output directory')
args = parser.parse_args()
main(args)
# Understanding Back-Translation at Scale (Edunov et al., 2018)
This page includes pre-trained models from the paper [Understanding Back-Translation at Scale (Edunov et al., 2018)](https://arxiv.org/abs/1808.09381).
## Pre-trained models
Model | Description | Dataset | Download
---|---|---|---
`transformer.wmt18.en-de` | Transformer <br> ([Edunov et al., 2018](https://arxiv.org/abs/1808.09381)) <br> WMT'18 winner | [WMT'18 English-German](http://www.statmt.org/wmt18/translation-task.html) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz) <br> See NOTE in the archive
## Example usage (torch.hub)
We require a few additional Python dependencies for preprocessing:
```bash
pip install subword_nmt sacremoses
```
Then to generate translations from the full model ensemble:
```python
import torch
# List available models
torch.hub.list('pytorch/fairseq') # [..., 'transformer.wmt18.en-de', ... ]
# Load the WMT'18 En-De ensemble
en2de_ensemble = torch.hub.load(
'pytorch/fairseq', 'transformer.wmt18.en-de',
checkpoint_file='wmt18.model1.pt:wmt18.model2.pt:wmt18.model3.pt:wmt18.model4.pt:wmt18.model5.pt',
tokenizer='moses', bpe='subword_nmt')
# The ensemble contains 5 models
len(en2de_ensemble.models)
# 5
# Translate
en2de_ensemble.translate('Hello world!')
# 'Hallo Welt!'
```
## Training your own model (WMT'18 English-German)
The following instructions can be adapted to reproduce the models from the paper.
#### Step 1. Prepare parallel data and optionally train a baseline (English-German) model
First download and preprocess the data:
```bash
# Download and prepare the data
cd examples/backtranslation/
bash prepare-wmt18en2de.sh
cd ../..
# Binarize the data
TEXT=examples/backtranslation/wmt18_en_de
fairseq-preprocess \
--joined-dictionary \
--source-lang en --target-lang de \
--trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
--destdir data-bin/wmt18_en_de --thresholdtgt 0 --thresholdsrc 0 \
--workers 20
# Copy the BPE code into the data-bin directory for future use
cp examples/backtranslation/wmt18_en_de/code data-bin/wmt18_en_de/code
```
(Optionally) Train a baseline model (English-German) using just the parallel data:
```bash
CHECKPOINT_DIR=checkpoints_en_de_parallel
fairseq-train --fp16 \
data-bin/wmt18_en_de \
--source-lang en --target-lang de \
--arch transformer_wmt_en_de_big --share-all-embeddings \
--dropout 0.3 --weight-decay 0.0 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
--lr 0.001 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
--max-tokens 3584 --update-freq 16 \
--max-update 30000 \
--save-dir $CHECKPOINT_DIR
# Note: the above command assumes 8 GPUs. Adjust `--update-freq` if you have a
# different number of GPUs.
```
Average the last 10 checkpoints:
```bash
python scripts/average_checkpoints.py \
--inputs $CHECKPOINT_DIR \
--num-epoch-checkpoints 10 \
--output $CHECKPOINT_DIR/checkpoint.avg10.pt
```
Evaluate BLEU:
```bash
# tokenized BLEU on newstest2017:
bash examples/backtranslation/tokenized_bleu.sh \
wmt17 \
en-de \
data-bin/wmt18_en_de \
data-bin/wmt18_en_de/code \
$CHECKPOINT_DIR/checkpoint.avg10.pt
# BLEU4 = 29.57, 60.9/35.4/22.9/15.5 (BP=1.000, ratio=1.014, syslen=63049, reflen=62152)
# compare to 29.46 in Table 1, which is also for tokenized BLEU
# generally it's better to report (detokenized) sacrebleu though:
bash examples/backtranslation/sacrebleu.sh \
wmt17 \
en-de \
data-bin/wmt18_en_de \
data-bin/wmt18_en_de/code \
$CHECKPOINT_DIR/checkpoint.avg10.pt
# BLEU+case.mixed+lang.en-de+numrefs.1+smooth.exp+test.wmt17+tok.13a+version.1.4.3 = 29.0 60.6/34.7/22.4/14.9 (BP = 1.000 ratio = 1.013 hyp_len = 62099 ref_len = 61287)
```
#### Step 2. Back-translate monolingual German data
Train a reverse model (German-English) to do the back-translation:
```bash
CHECKPOINT_DIR=checkpoints_de_en_parallel
fairseq-train --fp16 \
data-bin/wmt18_en_de \
--source-lang de --target-lang en \
--arch transformer_wmt_en_de_big --share-all-embeddings \
--dropout 0.3 --weight-decay 0.0 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
--lr 0.001 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
--max-tokens 3584 --update-freq 16 \
--max-update 30000 \
--save-dir $CHECKPOINT_DIR
# Note: the above command assumes 8 GPUs. Adjust `--update-freq` if you have a
# different number of GPUs.
```
Let's evaluate the back-translation (BT) model to make sure it is well trained:
```bash
bash examples/backtranslation/sacrebleu.sh \
wmt17 \
de-en \
data-bin/wmt18_en_de \
data-bin/wmt18_en_de/code \
$CHECKPOINT_DIR/checkpoint_best.py
# BLEU+case.mixed+lang.de-en+numrefs.1+smooth.exp+test.wmt17+tok.13a+version.1.4.3 = 34.9 66.9/41.8/28.5/19.9 (BP = 0.983 ratio = 0.984 hyp_len = 63342 ref_len = 64399)
# compare to the best system from WMT'17 which scored 35.1: http://matrix.statmt.org/matrix/systems_list/1868
```
Next prepare the monolingual data:
```bash
# Download and prepare the monolingual data
# By default the script samples 25M monolingual sentences, which after
# deduplication should be just over 24M sentences. These are split into 25
# shards, each with 1M sentences (except for the last shard).
cd examples/backtranslation/
bash prepare-de-monolingual.sh
cd ../..
# Binarize each shard of the monolingual data
TEXT=examples/backtranslation/wmt18_de_mono
for SHARD in $(seq -f "%02g" 0 24); do \
fairseq-preprocess \
--only-source \
--source-lang de --target-lang en \
--joined-dictionary \
--srcdict data-bin/wmt18_en_de/dict.de.txt \
--testpref $TEXT/bpe.monolingual.dedup.${SHARD} \
--destdir data-bin/wmt18_de_mono/shard${SHARD} \
--workers 20; \
cp data-bin/wmt18_en_de/dict.en.txt data-bin/wmt18_de_mono/shard${SHARD}/; \
done
```
Now we're ready to perform back-translation over the monolingual data. The
following command generates via sampling, but it's possible to use greedy
decoding (`--beam 1`), beam search (`--beam 5`),
top-k sampling (`--sampling --beam 1 --sampling-topk 10`), etc.:
```bash
mkdir backtranslation_output
for SHARD in $(seq -f "%02g" 0 24); do \
fairseq-generate --fp16 \
data-bin/wmt18_de_mono/shard${SHARD} \
--path $CHECKPOINT_DIR/checkpoint_best.pt \
--skip-invalid-size-inputs-valid-test \
--max-tokens 4096 \
--sampling --beam 1 \
> backtranslation_output/sampling.shard${SHARD}.out; \
done
```
After BT, use the `extract_bt_data.py` script to re-combine the shards, extract
the back-translations and apply length ratio filters:
```bash
python examples/backtranslation/extract_bt_data.py \
--minlen 1 --maxlen 250 --ratio 1.5 \
--output backtranslation_output/bt_data --srclang en --tgtlang de \
backtranslation_output/sampling.shard*.out
# Ensure lengths are the same:
# wc -l backtranslation_output/bt_data.{en,de}
# 21795614 backtranslation_output/bt_data.en
# 21795614 backtranslation_output/bt_data.de
# 43591228 total
```
Binarize the filtered BT data and combine it with the parallel data:
```bash
TEXT=backtranslation_output
fairseq-preprocess \
--source-lang en --target-lang de \
--joined-dictionary \
--srcdict data-bin/wmt18_en_de/dict.en.txt \
--trainpref $TEXT/bt_data \
--destdir data-bin/wmt18_en_de_bt \
--workers 20
# We want to train on the combined data, so we'll symlink the parallel + BT data
# in the wmt18_en_de_para_plus_bt directory. We link the parallel data as "train"
# and the BT data as "train1", so that fairseq will combine them automatically
# and so that we can use the `--upsample-primary` option to upsample the
# parallel data (if desired).
PARA_DATA=$(readlink -f data-bin/wmt18_en_de)
BT_DATA=$(readlink -f data-bin/wmt18_en_de_bt)
COMB_DATA=data-bin/wmt18_en_de_para_plus_bt
mkdir -p $COMB_DATA
for LANG in en de; do \
ln -s ${PARA_DATA}/dict.$LANG.txt ${COMB_DATA}/dict.$LANG.txt; \
for EXT in bin idx; do \
ln -s ${PARA_DATA}/train.en-de.$LANG.$EXT ${COMB_DATA}/train.en-de.$LANG.$EXT; \
ln -s ${BT_DATA}/train.en-de.$LANG.$EXT ${COMB_DATA}/train1.en-de.$LANG.$EXT; \
ln -s ${PARA_DATA}/valid.en-de.$LANG.$EXT ${COMB_DATA}/valid.en-de.$LANG.$EXT; \
ln -s ${PARA_DATA}/test.en-de.$LANG.$EXT ${COMB_DATA}/test.en-de.$LANG.$EXT; \
done; \
done
```
#### 3. Train an English-German model over the combined parallel + BT data
Finally we can train a model over the parallel + BT data:
```bash
CHECKPOINT_DIR=checkpoints_en_de_parallel_plus_bt
fairseq-train --fp16 \
data-bin/wmt18_en_de_para_plus_bt \
--upsample-primary 16 \
--source-lang en --target-lang de \
--arch transformer_wmt_en_de_big --share-all-embeddings \
--dropout 0.3 --weight-decay 0.0 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
--lr 0.0007 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
--max-tokens 3584 --update-freq 16 \
--max-update 100000 \
--save-dir $CHECKPOINT_DIR
# Note: the above command assumes 8 GPUs. Adjust `--update-freq` if you have a
# different number of GPUs.
```
Average the last 10 checkpoints:
```bash
python scripts/average_checkpoints.py \
--inputs $CHECKPOINT_DIR \
--num-epoch-checkpoints 10 \
--output $CHECKPOINT_DIR/checkpoint.avg10.pt
```
Evaluate BLEU:
```bash
# tokenized BLEU on newstest2017:
bash examples/backtranslation/tokenized_bleu.sh \
wmt17 \
en-de \
data-bin/wmt18_en_de \
data-bin/wmt18_en_de/code \
$CHECKPOINT_DIR/checkpoint.avg10.pt
# BLEU4 = 32.35, 64.4/38.9/26.2/18.3 (BP=0.977, ratio=0.977, syslen=60729, reflen=62152)
# compare to 32.35 in Table 1, which is also for tokenized BLEU
# generally it's better to report (detokenized) sacrebleu:
bash examples/backtranslation/sacrebleu.sh \
wmt17 \
en-de \
data-bin/wmt18_en_de \
data-bin/wmt18_en_de/code \
$CHECKPOINT_DIR/checkpoint.avg10.pt
# BLEU+case.mixed+lang.en-de+numrefs.1+smooth.exp+test.wmt17+tok.13a+version.1.4.3 = 31.5 64.3/38.2/25.6/17.6 (BP = 0.971 ratio = 0.971 hyp_len = 59515 ref_len = 61287)
```
## Citation
```bibtex
@inproceedings{edunov2018backtranslation,
title = {Understanding Back-Translation at Scale},
author = {Edunov, Sergey and Ott, Myle and Auli, Michael and Grangier, David},
booktitle = {Conference of the Association for Computational Linguistics (ACL)},
year = 2018,
}
```
#!/usr/bin/python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import fileinput
import hashlib
import sys
from multiprocessing import Pool
def get_hashes_and_lines(raw_line):
hash = hashlib.md5(raw_line).hexdigest()
return hash, raw_line
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--workers", type=int, default=10)
parser.add_argument("files", nargs="*", help="input files")
args = parser.parse_args()
seen = set()
with fileinput.input(args.files, mode="rb") as h:
pool = Pool(args.workers)
results = pool.imap_unordered(get_hashes_and_lines, h, 1000)
for i, (hash, raw_line) in enumerate(results):
if hash not in seen:
seen.add(hash)
sys.stdout.buffer.write(raw_line)
if i % 1000000 == 0:
print(i, file=sys.stderr, end="", flush=True)
elif i % 100000 == 0:
print(".", file=sys.stderr, end="", flush=True)
print(file=sys.stderr, flush=True)
if __name__ == "__main__":
main()
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import fileinput
from tqdm import tqdm
def main():
parser = argparse.ArgumentParser(
description=(
"Extract back-translations from the stdout of fairseq-generate. "
"If there are multiply hypotheses for a source, we only keep the first one. "
)
)
parser.add_argument("--output", required=True, help="output prefix")
parser.add_argument(
"--srclang", required=True, help="source language (extracted from H-* lines)"
)
parser.add_argument(
"--tgtlang", required=True, help="target language (extracted from S-* lines)"
)
parser.add_argument("--minlen", type=int, help="min length filter")
parser.add_argument("--maxlen", type=int, help="max length filter")
parser.add_argument("--ratio", type=float, help="ratio filter")
parser.add_argument("files", nargs="*", help="input files")
args = parser.parse_args()
def validate(src, tgt):
srclen = len(src.split(" ")) if src != "" else 0
tgtlen = len(tgt.split(" ")) if tgt != "" else 0
if (
(args.minlen is not None and (srclen < args.minlen or tgtlen < args.minlen))
or (
args.maxlen is not None
and (srclen > args.maxlen or tgtlen > args.maxlen)
)
or (
args.ratio is not None
and (max(srclen, tgtlen) / float(min(srclen, tgtlen)) > args.ratio)
)
):
return False
return True
def safe_index(toks, index, default):
try:
return toks[index]
except IndexError:
return default
with open(args.output + "." + args.srclang, "w") as src_h, open(
args.output + "." + args.tgtlang, "w"
) as tgt_h:
for line in tqdm(fileinput.input(args.files)):
if line.startswith("S-"):
tgt = safe_index(line.rstrip().split("\t"), 1, "")
elif line.startswith("H-"):
if tgt is not None:
src = safe_index(line.rstrip().split("\t"), 2, "")
if validate(src, tgt):
print(src, file=src_h)
print(tgt, file=tgt_h)
tgt = None
if __name__ == "__main__":
main()
#!/bin/bash
SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
BPEROOT=subword-nmt/subword_nmt
BPE_CODE=wmt18_en_de/code
SUBSAMPLE_SIZE=25000000
LANG=de
OUTDIR=wmt18_${LANG}_mono
orig=orig
tmp=$OUTDIR/tmp
mkdir -p $OUTDIR $tmp
URLS=(
"http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2007.de.shuffled.gz"
"http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2008.de.shuffled.gz"
"http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2009.de.shuffled.gz"
"http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2010.de.shuffled.gz"
"http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2011.de.shuffled.gz"
"http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.de.shuffled.gz"
"http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.de.shuffled.gz"
"http://www.statmt.org/wmt15/training-monolingual-news-crawl-v2/news.2014.de.shuffled.v2.gz"
"http://data.statmt.org/wmt16/translation-task/news.2015.de.shuffled.gz"
"http://data.statmt.org/wmt17/translation-task/news.2016.de.shuffled.gz"
"http://data.statmt.org/wmt18/translation-task/news.2017.de.shuffled.deduped.gz"
)
FILES=(
"news.2007.de.shuffled.gz"
"news.2008.de.shuffled.gz"
"news.2009.de.shuffled.gz"
"news.2010.de.shuffled.gz"
"news.2011.de.shuffled.gz"
"news.2012.de.shuffled.gz"
"news.2013.de.shuffled.gz"
"news.2014.de.shuffled.v2.gz"
"news.2015.de.shuffled.gz"
"news.2016.de.shuffled.gz"
"news.2017.de.shuffled.deduped.gz"
)
cd $orig
for ((i=0;i<${#URLS[@]};++i)); do
file=${FILES[i]}
if [ -f $file ]; then
echo "$file already exists, skipping download"
else
url=${URLS[i]}
wget "$url"
fi
done
cd ..
if [ -f $tmp/monolingual.${SUBSAMPLE_SIZE}.${LANG} ]; then
echo "found monolingual sample, skipping shuffle/sample/tokenize"
else
gzip -c -d -k $(for FILE in "${FILES[@]}"; do echo $orig/$FILE; done) \
| shuf -n $SUBSAMPLE_SIZE \
| perl $NORM_PUNC $LANG \
| perl $REM_NON_PRINT_CHAR \
| perl $TOKENIZER -threads 8 -a -l $LANG \
> $tmp/monolingual.${SUBSAMPLE_SIZE}.${LANG}
fi
if [ -f $tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${LANG} ]; then
echo "found BPE monolingual sample, skipping BPE step"
else
python $BPEROOT/apply_bpe.py -c $BPE_CODE \
< $tmp/monolingual.${SUBSAMPLE_SIZE}.${LANG} \
> $tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${LANG}
fi
if [ -f $tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${LANG} ]; then
echo "found deduplicated monolingual sample, skipping deduplication step"
else
python deduplicate_lines.py $tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${LANG} \
> $tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${LANG}
fi
if [ -f $OUTDIR/bpe.monolingual.dedup.00.de ]; then
echo "found sharded data, skipping sharding step"
else
split --lines 1000000 --numeric-suffixes \
--additional-suffix .${LANG} \
$tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${LANG} \
$OUTDIR/bpe.monolingual.dedup.
fi
#!/bin/bash
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git
echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
git clone https://github.com/rsennrich/subword-nmt.git
SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
BPEROOT=subword-nmt/subword_nmt
BPE_TOKENS=32000
URLS=(
"http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
"http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
"http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz"
"http://data.statmt.org/wmt18/translation-task/rapid2016.tgz"
"http://data.statmt.org/wmt17/translation-task/dev.tgz"
"http://statmt.org/wmt14/test-full.tgz"
)
FILES=(
"training-parallel-europarl-v7.tgz"
"training-parallel-commoncrawl.tgz"
"training-parallel-nc-v13.tgz"
"rapid2016.tgz"
"dev.tgz"
"test-full.tgz"
)
CORPORA=(
"training/europarl-v7.de-en"
"commoncrawl.de-en"
"training-parallel-nc-v13/news-commentary-v13.de-en"
"rapid2016.de-en"
)
if [ ! -d "$SCRIPTS" ]; then
echo "Please set SCRIPTS variable correctly to point to Moses scripts."
exit 1
fi
OUTDIR=wmt18_en_de
src=en
tgt=de
lang=en-de
prep=$OUTDIR
tmp=$prep/tmp
orig=orig
mkdir -p $orig $tmp $prep
cd $orig
for ((i=0;i<${#URLS[@]};++i)); do
file=${FILES[i]}
if [ -f $file ]; then
echo "$file already exists, skipping download"
else
url=${URLS[i]}
wget "$url"
if [ -f $file ]; then
echo "$url successfully downloaded."
else
echo "$url not successfully downloaded."
exit 1
fi
if [ ${file: -4} == ".tgz" ]; then
tar zxvf $file
elif [ ${file: -4} == ".tar" ]; then
tar xvf $file
fi
fi
done
cd ..
echo "pre-processing train data..."
for l in $src $tgt; do
rm $tmp/train.tags.$lang.tok.$l
for f in "${CORPORA[@]}"; do
cat $orig/$f.$l | \
perl $NORM_PUNC $l | \
perl $REM_NON_PRINT_CHAR | \
perl $TOKENIZER -threads 8 -a -l $l >> $tmp/train.tags.$lang.tok.$l
done
done
echo "pre-processing test data..."
for l in $src $tgt; do
if [ "$l" == "$src" ]; then
t="src"
else
t="ref"
fi
grep '<seg id' $orig/test-full/newstest2014-deen-$t.$l.sgm | \
sed -e 's/<seg id="[0-9]*">\s*//g' | \
sed -e 's/\s*<\/seg>\s*//g' | \
sed -e "s/\’/\'/g" | \
perl $TOKENIZER -threads 8 -a -l $l > $tmp/test.$l
echo ""
done
echo "splitting train and valid..."
for l in $src $tgt; do
awk '{if (NR%100 == 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l
awk '{if (NR%100 != 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l
done
TRAIN=$tmp/train.de-en
BPE_CODE=$prep/code
rm -f $TRAIN
for l in $src $tgt; do
cat $tmp/train.$l >> $TRAIN
done
echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
for L in $src $tgt; do
for f in train.$L valid.$L test.$L; do
echo "apply_bpe.py to ${f}..."
python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $tmp/bpe.$f
done
done
perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250
perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250
for L in $src $tgt; do
cp $tmp/bpe.test.$L $prep/test.$L
done
#!/bin/bash
if [ $# -ne 5 ]; then
echo "usage: $0 [dataset=wmt14/full] [langpair=en-de] [databin] [bpecode] [model]"
exit
fi
DATASET=$1
LANGPAIR=$2
DATABIN=$3
BPECODE=$4
MODEL=$5
SRCLANG=$(echo $LANGPAIR | cut -d '-' -f 1)
TGTLANG=$(echo $LANGPAIR | cut -d '-' -f 2)
BPEROOT=examples/backtranslation/subword-nmt/subword_nmt
if [ ! -e $BPEROOT ]; then
BPEROOT=subword-nmt/subword_nmt
if [ ! -e $BPEROOT ]; then
echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
git clone https://github.com/rsennrich/subword-nmt.git
fi
fi
sacrebleu -t $DATASET -l $LANGPAIR --echo src \
| sacremoses tokenize -a -l $SRCLANG -q \
| python $BPEROOT/apply_bpe.py -c $BPECODE \
| fairseq-interactive $DATABIN --path $MODEL \
-s $SRCLANG -t $TGTLANG \
--beam 5 --remove-bpe --buffer-size 1024 --max-tokens 8000 \
| grep ^H- | cut -f 3- \
| sacremoses detokenize -l $TGTLANG -q \
| sacrebleu -t $DATASET -l $LANGPAIR
#!/bin/bash
if [ $# -ne 5 ]; then
echo "usage: $0 [dataset=wmt14/full] [langpair=en-de] [databin] [bpecode] [model]"
exit
fi
DATASET=$1
LANGPAIR=$2
DATABIN=$3
BPECODE=$4
MODEL=$5
SRCLANG=$(echo $LANGPAIR | cut -d '-' -f 1)
TGTLANG=$(echo $LANGPAIR | cut -d '-' -f 2)
BPEROOT=examples/backtranslation/subword-nmt/subword_nmt
if [ ! -e $BPEROOT ]; then
BPEROOT=subword-nmt/subword_nmt
if [ ! -e $BPEROOT ]; then
echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
git clone https://github.com/rsennrich/subword-nmt.git
fi
fi
TMP_REF=$(mktemp)
sacrebleu -t $DATASET -l $LANGPAIR --echo ref -q \
| sacremoses normalize -l $TGTLANG -q \
| sacremoses tokenize -a -l $TGTLANG -q \
> $TMP_REF
sacrebleu -t $DATASET -l $LANGPAIR --echo src -q \
| sacremoses normalize -l $SRCLANG -q \
| sacremoses tokenize -a -l $SRCLANG -q \
| python $BPEROOT/apply_bpe.py -c $BPECODE \
| fairseq-interactive $DATABIN --path $MODEL \
-s $SRCLANG -t $TGTLANG \
--beam 5 --remove-bpe --buffer-size 1024 --max-tokens 8000 \
| grep ^H- | cut -f 3- \
| fairseq-score --ref $TMP_REF
rm -f $TMP_REF
# Fine-tuning BART on GLUE tasks
### 1) Download the data from GLUE website (https://gluebenchmark.com/tasks) using following commands:
```bash
wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py
python download_glue_data.py --data_dir glue_data --tasks all
```
### 2) Preprocess GLUE task data (same as RoBERTa):
```bash
./examples/roberta/preprocess_GLUE_tasks.sh glue_data <glue_task_name>
```
`glue_task_name` is one of the following:
`{ALL, QQP, MNLI, QNLI, MRPC, RTE, STS-B, SST-2, CoLA}`
Use `ALL` for preprocessing all the glue tasks.
### 3) Fine-tuning on GLUE task:
Example fine-tuning cmd for `RTE` task
```bash
TOTAL_NUM_UPDATES=2036 # 10 epochs through RTE for bsz 16
WARMUP_UPDATES=61 # 6 percent of the number of updates
LR=1e-05 # Peak LR for polynomial LR scheduler.
NUM_CLASSES=2
MAX_SENTENCES=16 # Batch size.
BART_PATH=/path/to/bart/model.pt
CUDA_VISIBLE_DEVICES=0,1 fairseq-train RTE-bin/ \
--restore-file $BART_PATH \
--batch-size $MAX_SENTENCES \
--max-tokens 4400 \
--task sentence_prediction \
--add-prev-output-tokens \
--layernorm-embedding \
--share-all-embeddings \
--share-decoder-input-output-embed \
--reset-optimizer --reset-dataloader --reset-meters \
--required-batch-size-multiple 1 \
--init-token 0 \
--arch bart_large \
--criterion sentence_prediction \
--num-classes $NUM_CLASSES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-08 \
--clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--max-epoch 10 \
--find-unused-parameters \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric;
```
For each of the GLUE task, you will need to use following cmd-line arguments:
Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B
---|---|---|---|---|---|---|---|---
`--num-classes` | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 1
`--lr` | 5e-6 | 1e-5 | 1e-5 | 1e-5 | 5e-6 | 2e-5 | 2e-5 | 2e-5
`bsz` | 128 | 32 | 32 | 32 | 128 | 64 | 64 | 32
`--total-num-update` | 30968 | 33112 | 113272 | 1018 | 5233 | 1148 | 1334 | 1799
`--warmup-updates` | 1858 | 1986 | 6796 | 61 | 314 | 68 | 80 | 107
For `STS-B` additionally add `--regression-target --best-checkpoint-metric loss` and remove `--maximize-best-checkpoint-metric`.
**Note:**
a) `--total-num-updates` is used by `--polynomial_decay` scheduler and is calculated for `--max-epoch=10` and `--batch-size=32/64/128` depending on the task.
b) Above cmd-args and hyperparams are tested on Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory resources available to you, you can use increase `--update-freq` and reduce `--batch-size`.
### Inference on GLUE task
After training the model as mentioned in previous step, you can perform inference with checkpoints in `checkpoints/` directory using following python code snippet:
```python
from fairseq.models.bart import BARTModel
bart = BARTModel.from_pretrained(
'checkpoints/',
checkpoint_file='checkpoint_best.pt',
data_name_or_path='RTE-bin'
)
label_fn = lambda label: bart.task.label_dictionary.string(
[label + bart.task.label_dictionary.nspecial]
)
ncorrect, nsamples = 0, 0
bart.cuda()
bart.eval()
with open('glue_data/RTE/dev.tsv') as fin:
fin.readline()
for index, line in enumerate(fin):
tokens = line.strip().split('\t')
sent1, sent2, target = tokens[1], tokens[2], tokens[3]
tokens = bart.encode(sent1, sent2)
prediction = bart.predict('sentence_classification_head', tokens).argmax().item()
prediction_label = label_fn(prediction)
ncorrect += int(prediction_label == target)
nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
```
# BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension
[https://arxiv.org/abs/1910.13461](https://arxiv.org/abs/1910.13461)
## Introduction
BART is sequence-to-sequence model trained with denoising as pretraining objective. We show that this pretraining objective is more generic and show that we can match [RoBERTa](../roberta) results on SQuAD and GLUE and gain state-of-the-art results on summarization (XSum, CNN dataset), long form generative question answering (ELI5) and dialog response genration (ConvAI2). See the associated paper for more details.
## Pre-trained models
Model | Description | # params | Download
---|---|---|---
`bart.base` | BART model with 6 encoder and decoder layers | 140M | [bart.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.base.tar.gz)
`bart.large` | BART model with 12 encoder and decoder layers | 400M | [bart.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz)
`bart.large.mnli` | `bart.large` finetuned on `MNLI` | 400M | [bart.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.mnli.tar.gz)
`bart.large.cnn` | `bart.large` finetuned on `CNN-DM` | 400M | [bart.large.cnn.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.cnn.tar.gz)
`bart.large.xsum` | `bart.large` finetuned on `Xsum` | 400M | [bart.large.xsum.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.xsum.tar.gz)
## Results
**[GLUE (Wang et al., 2019)](https://gluebenchmark.com/)**
_(dev set, single model, single-task finetuning)_
Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B
---|---|---|---|---|---|---|---|---
`roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4
`bart.large` | 89.9 | 94.9 | 92.5 | 87.0 | 96.6 | 90.4 | 62.8 | 91.2
**[SQuAD (Rajpurkar et al., 2018)](https://rajpurkar.github.io/SQuAD-explorer/)**
_(dev set, no additional data used)_
Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1
---|---|---
`roberta.large` | 88.9/94.6 | 86.5/89.4
`bart.large` | 88.8/94.6 | 86.1/89.2
**[CNN/Daily Mail](http://nlpprogress.com/english/summarization.html)**
_(test set, no additional data used)_
Model | R1 | R2 | RL
---|---|---|---
`BERTSUMEXTABS` | 42.13 | 19.60 | 39.18
`bart.large` | 44.16 | 21.28 | 40.90
## Example usage
##### Load BART from torch.hub (PyTorch >= 1.1):
```python
import torch
bart = torch.hub.load('pytorch/fairseq', 'bart.large')
bart.eval() # disable dropout (or leave in train mode to finetune)
```
##### Load BART (for PyTorch 1.0 or custom models):
```python
# Download bart.large model
wget https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz
tar -xzvf bart.large.tar.gz
# Load the model in fairseq
from fairseq.models.bart import BARTModel
bart = BARTModel.from_pretrained('/path/to/bart.large', checkpoint_file='model.pt')
bart.eval() # disable dropout (or leave in train mode to finetune)
```
##### Apply Byte-Pair Encoding (BPE) to input text:
```python
tokens = bart.encode('Hello world!')
assert tokens.tolist() == [0, 31414, 232, 328, 2]
bart.decode(tokens) # 'Hello world!'
```
##### Extract features from BART:
```python
# Extract the last layer's features
last_layer_features = bart.extract_features(tokens)
assert last_layer_features.size() == torch.Size([1, 5, 1024])
# Extract all layer's features from decoder (layer 0 is the embedding layer)
all_layers = bart.extract_features(tokens, return_all_hiddens=True)
assert len(all_layers) == 13
assert torch.all(all_layers[-1] == last_layer_features)
```
##### Use BART for sentence-pair classification tasks:
```python
# Download BART already finetuned for MNLI
bart = torch.hub.load('pytorch/fairseq', 'bart.large.mnli')
bart.eval() # disable dropout for evaluation
# Encode a pair of sentences and make a prediction
tokens = bart.encode('BART is a seq2seq model.', 'BART is not sequence to sequence.')
bart.predict('mnli', tokens).argmax() # 0: contradiction
# Encode another pair of sentences
tokens = bart.encode('BART is denoising autoencoder.', 'BART is version of autoencoder.')
bart.predict('mnli', tokens).argmax() # 2: entailment
```
##### Register a new (randomly initialized) classification head:
```python
bart.register_classification_head('new_task', num_classes=3)
logprobs = bart.predict('new_task', tokens)
```
##### Batched prediction:
```python
import torch
from fairseq.data.data_utils import collate_tokens
bart = torch.hub.load('pytorch/fairseq', 'bart.large.mnli')
bart.eval()
batch_of_pairs = [
['BART is a seq2seq model.', 'BART is not sequence to sequence.'],
['BART is denoising autoencoder.', 'BART is version of autoencoder.'],
]
batch = collate_tokens(
[bart.encode(pair[0], pair[1]) for pair in batch_of_pairs], pad_idx=1
)
logprobs = bart.predict('mnli', batch)
print(logprobs.argmax(dim=1))
# tensor([0, 2])
```
##### Using the GPU:
```python
bart.cuda()
bart.predict('new_task', tokens)
```
#### Filling masks:
BART can be used to fill multiple `<mask>` tokens in the input.
```python
bart = torch.hub.load('pytorch/fairseq', 'bart.base')
bart.eval()
bart.fill_mask(['The cat <mask> on the <mask>.'], topk=3, beam=10)
# [[('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))]]
```
Note that by default we enforce the output length to match the input length.
This can be disabled by setting ``match_source_len=False``:
```
bart.fill_mask(['The cat <mask> on the <mask>.'], topk=3, beam=10, match_source_len=False)
# [[('The cat was on the ground.', tensor(-0.6185)), ('The cat was asleep on the couch.', tensor(-0.6276)), ('The cat was on the floor.', tensor(-0.6800))]]
```
Example code to fill masks for a batch of sentences using GPU
```
bart.cuda()
bart.fill_mask(['The cat <mask> on the <mask>.', 'The dog <mask> on the <mask>.'], topk=3, beam=10)
# [[('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))], [('The dog was on the ground.', tensor(-0.6190)), ('The dog lay on the ground.', tensor(-0.6711)),
('The dog was asleep on the couch', tensor(-0.6796))]]
```
#### Evaluating the `bart.large.mnli` model:
Example python code snippet to evaluate accuracy on the MNLI `dev_matched` set.
```python
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
ncorrect, nsamples = 0, 0
bart.cuda()
bart.eval()
with open('glue_data/MNLI/dev_matched.tsv') as fin:
fin.readline()
for index, line in enumerate(fin):
tokens = line.strip().split('\t')
sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
tokens = bart.encode(sent1, sent2)
prediction = bart.predict('mnli', tokens).argmax().item()
prediction_label = label_map[prediction]
ncorrect += int(prediction_label == target)
nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# Expected output: 0.9010
```
#### Evaluating the `bart.large.cnn` model:
- Follow instructions [here](https://github.com/abisee/cnn-dailymail) to download and process into data-files such that `test.source` and `test.target` has one line for each non-tokenized sample.
- For simpler preprocessing, you can also `wget https://cdn-datasets.huggingface.co/summarization/cnn_dm_v2.tgz`, although there is no guarantee of identical scores
- `huggingface/transformers` has a simpler interface that supports [single-gpu](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/run_eval.py) and [multi-gpu](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/run_distributed_eval.py) beam search.
In `huggingface/transformers`, the BART models' paths are `facebook/bart-large-cnn` and `facebook/bart-large-xsum`.
In `fairseq`, summaries can be generated using:
```bash
cp data-bin/cnn_dm/dict.source.txt checkpoints/
python examples/bart/summarize.py \
--model-dir pytorch/fairseq \
--model-file bart.large.cnn \
--src cnn_dm/test.source \
--out cnn_dm/test.hypo
```
For calculating rouge, install `files2rouge` from [here](https://github.com/pltrdy/files2rouge).
```bash
export CLASSPATH=/path/to/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar
# Tokenize hypothesis and target files.
cat test.hypo | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > test.hypo.tokenized
cat test.target | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > test.hypo.target
files2rouge test.hypo.tokenized test.hypo.target
# Expected output: (ROUGE-2 Average_F: 0.21238)
```
## Finetuning
- [Finetuning on GLUE](README.glue.md)
- [Finetuning on CNN-DM](README.summarization.md)
## Citation
```bibtex
@article{lewis2019bart,
title = {BART: Denoising Sequence-to-Sequence Pre-training for Natural
Language Generation, Translation, and Comprehension},
author = {Mike Lewis and Yinhan Liu and Naman Goyal and Marjan Ghazvininejad and
Abdelrahman Mohamed and Omer Levy and Veselin Stoyanov
and Luke Zettlemoyer },
journal={arXiv preprint arXiv:1910.13461},
year = {2019},
}
```
# Fine-tuning BART on CNN-Dailymail summarization task
### 1) Download the CNN and Daily Mail data and preprocess it into data files with non-tokenized cased samples.
Follow the instructions [here](https://github.com/abisee/cnn-dailymail) to download the original CNN and Daily Mail datasets. To preprocess the data, refer to the pointers in [this issue](https://github.com/pytorch/fairseq/issues/1391) or check out the code [here](https://github.com/artmatsak/cnn-dailymail).
Follow the instructions [here](https://github.com/EdinburghNLP/XSum) to download the original Extreme Summarization datasets, or check out the code [here](https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset), Please keep the raw dataset and make sure no tokenization nor BPE on the dataset.
### 2) BPE preprocess:
```bash
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
TASK=cnn_dm
for SPLIT in train val
do
for LANG in source target
do
python -m examples.roberta.multiprocessing_bpe_encoder \
--encoder-json encoder.json \
--vocab-bpe vocab.bpe \
--inputs "$TASK/$SPLIT.$LANG" \
--outputs "$TASK/$SPLIT.bpe.$LANG" \
--workers 60 \
--keep-empty;
done
done
```
### 3) Binarize dataset:
```bash
fairseq-preprocess \
--source-lang "source" \
--target-lang "target" \
--trainpref "${TASK}/train.bpe" \
--validpref "${TASK}/val.bpe" \
--destdir "${TASK}-bin/" \
--workers 60 \
--srcdict dict.txt \
--tgtdict dict.txt;
```
### 4) Fine-tuning on CNN-DM summarization task:
Example fine-tuning CNN-DM
```bash
TOTAL_NUM_UPDATES=20000
WARMUP_UPDATES=500
LR=3e-05
MAX_TOKENS=2048
UPDATE_FREQ=4
BART_PATH=/path/to/bart/model.pt
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 fairseq-train cnn_dm-bin \
--restore-file $BART_PATH \
--max-tokens $MAX_TOKENS \
--task translation \
--source-lang source --target-lang target \
--truncate-source \
--layernorm-embedding \
--share-all-embeddings \
--share-decoder-input-output-embed \
--reset-optimizer --reset-dataloader --reset-meters \
--required-batch-size-multiple 1 \
--arch bart_large \
--criterion label_smoothed_cross_entropy \
--label-smoothing 0.1 \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-08 \
--clip-norm 0.1 \
--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
--fp16 --update-freq $UPDATE_FREQ \
--skip-invalid-size-inputs-valid-test \
--find-unused-parameters;
```
Above is expected to run on `1` node with `8 32gb-V100`.
Expected training time is about `5 hours`. Training time can be reduced with distributed training on `4` nodes and `--update-freq 1`.
Use TOTAL_NUM_UPDATES=15000 UPDATE_FREQ=2 for Xsum task
### Inference for CNN-DM test data using above trained checkpoint.
After training the model as mentioned in previous step, you can perform inference with checkpoints in `checkpoints/` directory using `eval_cnn.py`, for example
```bash
cp data-bin/cnn_dm/dict.source.txt checkpoints/
python examples/bart/summarize.py \
--model-dir checkpoints \
--model-file checkpoint_best.pt \
--src cnn_dm/test.source \
--out cnn_dm/test.hypo
```
For XSUM, which uses beam=6, lenpen=1.0, max_len_b=60, min_len=10:
```bash
cp data-bin/cnn_dm/dict.source.txt checkpoints/
python examples/bart/summarize.py \
--model-dir checkpoints \
--model-file checkpoint_best.pt \
--src cnn_dm/test.source \
--out cnn_dm/test.hypo \
--xsum-kwargs
```
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
from fairseq.models.bart import BARTModel
import argparse
XSUM_KWARGS = dict(beam=6, lenpen=1.0, max_len_b=60, min_len=10, no_repeat_ngram_size=3)
CNN_KWARGS = dict(beam=4, lenpen=2.0, max_len_b=140, min_len=55, no_repeat_ngram_size=3)
@torch.no_grad()
def generate(bart, infile, outfile="bart_hypo.txt", bsz=32, n_obs=None, **eval_kwargs):
count = 1
# if n_obs is not None: bsz = min(bsz, n_obs)
with open(infile) as source, open(outfile, "w") as fout:
sline = source.readline().strip()
slines = [sline]
for sline in source:
if n_obs is not None and count > n_obs:
break
if count % bsz == 0:
hypotheses_batch = bart.sample(slines, **eval_kwargs)
for hypothesis in hypotheses_batch:
fout.write(hypothesis + "\n")
fout.flush()
slines = []
slines.append(sline.strip())
count += 1
if slines != []:
hypotheses_batch = bart.sample(slines, **eval_kwargs)
for hypothesis in hypotheses_batch:
fout.write(hypothesis + "\n")
fout.flush()
def main():
"""
Usage::
python examples/bart/summarize.py \
--model-dir $HOME/bart.large.cnn \
--model-file model.pt \
--src $HOME/data-bin/cnn_dm/test.source
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"--model-dir",
required=True,
type=str,
default="bart.large.cnn/",
help="path containing model file and src_dict.txt",
)
parser.add_argument(
"--model-file",
default="checkpoint_best.pt",
help="where in model_dir are weights saved",
)
parser.add_argument(
"--src", default="test.source", help="text to summarize", type=str
)
parser.add_argument(
"--out", default="test.hypo", help="where to save summaries", type=str
)
parser.add_argument("--bsz", default=32, help="where to save summaries", type=int)
parser.add_argument(
"--n", default=None, help="how many examples to summarize", type=int
)
parser.add_argument(
"--xsum-kwargs",
action="store_true",
default=False,
help="if true use XSUM_KWARGS else CNN_KWARGS",
)
args = parser.parse_args()
eval_kwargs = XSUM_KWARGS if args.xsum_kwargs else CNN_KWARGS
if args.model_dir == "pytorch/fairseq":
bart = torch.hub.load("pytorch/fairseq", args.model_file)
else:
bart = BARTModel.from_pretrained(
args.model_dir,
checkpoint_file=args.model_file,
data_name_or_path=args.model_dir,
)
bart = bart.eval()
if torch.cuda.is_available():
bart = bart.cuda().half()
generate(
bart, args.src, bsz=args.bsz, n_obs=args.n, outfile=args.out, **eval_kwargs
)
if __name__ == "__main__":
main()
# Neural Machine Translation with Byte-Level Subwords
https://arxiv.org/abs/1909.03341
We provide an implementation of byte-level byte-pair encoding (BBPE), taking IWSLT 2017 Fr-En translation as
example.
## Data
Get data and generate fairseq binary dataset:
```bash
bash ./get_data.sh
```
## Model Training
Train Transformer model with Bi-GRU embedding contextualization (implemented in `gru_transformer.py`):
```bash
# VOCAB=bytes
# VOCAB=chars
VOCAB=bbpe2048
# VOCAB=bpe2048
# VOCAB=bbpe4096
# VOCAB=bpe4096
# VOCAB=bpe16384
```
```bash
fairseq-train "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \
--arch gru_transformer --encoder-layers 2 --decoder-layers 2 --dropout 0.3 --share-all-embeddings \
--optimizer adam --adam-betas '(0.9, 0.98)' \
--lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--log-format 'simple' --log-interval 100 --save-dir "checkpoints/${VOCAB}" \
--batch-size 100 --max-update 100000 --update-freq 2
```
## Generation
`fairseq-generate` requires bytes (BBPE) decoder to convert byte-level representation back to characters:
```bash
# BPE=--bpe bytes
# BPE=--bpe characters
BPE=--bpe byte_bpe --sentencepiece-model-path data/spm_bbpe2048.model
# BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe2048.model
# BPE=--bpe byte_bpe --sentencepiece-model-path data/spm_bbpe4096.model
# BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe4096.model
# BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe16384.model
```
```bash
fairseq-generate "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \
--source-lang fr --gen-subset test --sacrebleu --path "checkpoints/${VOCAB}/checkpoint_last.pt" \
--tokenizer moses --moses-target-lang en ${BPE}
```
When using `fairseq-interactive`, bytes (BBPE) encoder/decoder is required to tokenize input data and detokenize model predictions:
```bash
fairseq-interactive "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \
--path "checkpoints/${VOCAB}/checkpoint_last.pt" --input data/test.fr --tokenizer moses --moses-source-lang fr \
--moses-target-lang en ${BPE} --buffer-size 1000 --max-tokens 10000
```
## Results
| Vocabulary | Model | BLEU |
|:-------------:|:-------------:|:-------------:|
| Joint BPE 16k ([Kudo, 2018](https://arxiv.org/abs/1804.10959)) | 512d LSTM 2+2 | 33.81 |
| Joint BPE 16k | Transformer base 2+2 (w/ GRU) | 36.64 (36.72) |
| Joint BPE 4k | Transformer base 2+2 (w/ GRU) | 35.49 (36.10) |
| Joint BBPE 4k | Transformer base 2+2 (w/ GRU) | 35.61 (35.82) |
| Joint BPE 2k | Transformer base 2+2 (w/ GRU) | 34.87 (36.13) |
| Joint BBPE 2k | Transformer base 2+2 (w/ GRU) | 34.98 (35.43) |
| Characters | Transformer base 2+2 (w/ GRU) | 31.78 (33.30) |
| Bytes | Transformer base 2+2 (w/ GRU) | 31.57 (33.62) |
## Citation
```
@misc{wang2019neural,
title={Neural Machine Translation with Byte-Level Subwords},
author={Changhan Wang and Kyunghyun Cho and Jiatao Gu},
year={2019},
eprint={1909.03341},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
## Contact
Changhan Wang ([changhan@fb.com](mailto:changhan@fb.com)),
Kyunghyun Cho ([kyunghyuncho@fb.com](mailto:kyunghyuncho@fb.com)),
Jiatao Gu ([jgu@fb.com](mailto:jgu@fb.com))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import os
import os.path as op
from collections import namedtuple
from multiprocessing import cpu_count
from typing import List, Optional
import sentencepiece as sp
from fairseq.data.encoders.byte_bpe import ByteBPE
from fairseq.data.encoders.byte_utils import byte_encode
from fairseq.data.encoders.bytes import Bytes
from fairseq.data.encoders.characters import Characters
from fairseq.data.encoders.moses_tokenizer import MosesTokenizer
from fairseq.data.encoders.sentencepiece_bpe import SentencepieceBPE
SPLITS = ["train", "valid", "test"]
def _convert_xml(in_path: str, out_path: str):
with open(in_path) as f, open(out_path, "w") as f_o:
for s in f:
ss = s.strip()
if not ss.startswith("<seg"):
continue
ss = ss.replace("</seg>", "").split('">')
assert len(ss) == 2
f_o.write(ss[1].strip() + "\n")
def _convert_train(in_path: str, out_path: str):
with open(in_path) as f, open(out_path, "w") as f_o:
for s in f:
ss = s.strip()
if ss.startswith("<"):
continue
f_o.write(ss.strip() + "\n")
def _get_bytes(in_path: str, out_path: str):
with open(in_path) as f, open(out_path, "w") as f_o:
for s in f:
f_o.write(Bytes.encode(s.strip()) + "\n")
def _get_chars(in_path: str, out_path: str):
with open(in_path) as f, open(out_path, "w") as f_o:
for s in f:
f_o.write(Characters.encode(s.strip()) + "\n")
def pretokenize(in_path: str, out_path: str, src: str, tgt: str):
Args = namedtuple(
"Args",
[
"moses_source_lang",
"moses_target_lang",
"moses_no_dash_splits",
"moses_no_escape",
],
)
args = Args(
moses_source_lang=src,
moses_target_lang=tgt,
moses_no_dash_splits=False,
moses_no_escape=False,
)
pretokenizer = MosesTokenizer(args)
with open(in_path) as f, open(out_path, "w") as f_o:
for s in f:
f_o.write(pretokenizer.encode(s.strip()) + "\n")
def _convert_to_bchar(in_path_prefix: str, src: str, tgt: str, out_path: str):
with open(out_path, "w") as f_o:
for lang in [src, tgt]:
with open(f"{in_path_prefix}.{lang}") as f:
for s in f:
f_o.write(byte_encode(s.strip()) + "\n")
def _get_bpe(in_path: str, model_prefix: str, vocab_size: int):
arguments = [
f"--input={in_path}",
f"--model_prefix={model_prefix}",
f"--model_type=bpe",
f"--vocab_size={vocab_size}",
"--character_coverage=1.0",
"--normalization_rule_name=identity",
f"--num_threads={cpu_count()}",
]
sp.SentencePieceTrainer.Train(" ".join(arguments))
def _apply_bbpe(model_path: str, in_path: str, out_path: str):
Args = namedtuple("Args", ["sentencepiece_model_path"])
args = Args(sentencepiece_model_path=model_path)
tokenizer = ByteBPE(args)
with open(in_path) as f, open(out_path, "w") as f_o:
for s in f:
f_o.write(tokenizer.encode(s.strip()) + "\n")
def _apply_bpe(model_path: str, in_path: str, out_path: str):
Args = namedtuple("Args", ["sentencepiece_model"])
args = Args(sentencepiece_model=model_path)
tokenizer = SentencepieceBPE(args)
with open(in_path) as f, open(out_path, "w") as f_o:
for s in f:
f_o.write(tokenizer.encode(s.strip()) + "\n")
def _concat_files(in_paths: List[str], out_path: str):
with open(out_path, "w") as f_o:
for p in in_paths:
with open(p) as f:
for r in f:
f_o.write(r)
def preprocess_iwslt17(
root: str,
src: str,
tgt: str,
bpe_size: Optional[int],
need_chars: bool,
bbpe_size: Optional[int],
need_bytes: bool,
):
# extract bitext
in_root = op.join(root, f"{src}-{tgt}")
for lang in [src, tgt]:
_convert_train(
op.join(in_root, f"train.tags.{src}-{tgt}.{lang}"),
op.join(root, f"train.{lang}"),
)
_convert_xml(
op.join(in_root, f"IWSLT17.TED.dev2010.{src}-{tgt}.{lang}.xml"),
op.join(root, f"valid.{lang}"),
)
_convert_xml(
op.join(in_root, f"IWSLT17.TED.tst2015.{src}-{tgt}.{lang}.xml"),
op.join(root, f"test.{lang}"),
)
# pre-tokenize
for lang in [src, tgt]:
for split in SPLITS:
pretokenize(
op.join(root, f"{split}.{lang}"),
op.join(root, f"{split}.moses.{lang}"),
src,
tgt,
)
# tokenize with BPE vocabulary
if bpe_size is not None:
# learn vocabulary
concated_train_path = op.join(root, "train.all")
_concat_files(
[op.join(root, "train.moses.fr"), op.join(root, "train.moses.en")],
concated_train_path,
)
bpe_model_prefix = op.join(root, f"spm_bpe{bpe_size}")
_get_bpe(concated_train_path, bpe_model_prefix, bpe_size)
os.remove(concated_train_path)
# apply
for lang in [src, tgt]:
for split in SPLITS:
_apply_bpe(
bpe_model_prefix + ".model",
op.join(root, f"{split}.moses.{lang}"),
op.join(root, f"{split}.moses.bpe{bpe_size}.{lang}"),
)
# tokenize with bytes vocabulary
if need_bytes:
for lang in [src, tgt]:
for split in SPLITS:
_get_bytes(
op.join(root, f"{split}.moses.{lang}"),
op.join(root, f"{split}.moses.bytes.{lang}"),
)
# tokenize with characters vocabulary
if need_chars:
for lang in [src, tgt]:
for split in SPLITS:
_get_chars(
op.join(root, f"{split}.moses.{lang}"),
op.join(root, f"{split}.moses.chars.{lang}"),
)
# tokenize with byte-level BPE vocabulary
if bbpe_size is not None:
# learn vocabulary
bchar_path = op.join(root, "train.bchar")
_convert_to_bchar(op.join(root, "train.moses"), src, tgt, bchar_path)
bbpe_model_prefix = op.join(root, f"spm_bbpe{bbpe_size}")
_get_bpe(bchar_path, bbpe_model_prefix, bbpe_size)
os.remove(bchar_path)
# apply
for lang in [src, tgt]:
for split in SPLITS:
_apply_bbpe(
bbpe_model_prefix + ".model",
op.join(root, f"{split}.moses.{lang}"),
op.join(root, f"{split}.moses.bbpe{bbpe_size}.{lang}"),
)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--root", type=str, default="data")
parser.add_argument(
"--bpe-vocab",
default=None,
type=int,
help="Generate tokenized bitext with BPE of size K."
"Default to None (disabled).",
)
parser.add_argument(
"--bbpe-vocab",
default=None,
type=int,
help="Generate tokenized bitext with BBPE of size K."
"Default to None (disabled).",
)
parser.add_argument(
"--byte-vocab",
action="store_true",
help="Generate tokenized bitext with bytes vocabulary",
)
parser.add_argument(
"--char-vocab",
action="store_true",
help="Generate tokenized bitext with chars vocabulary",
)
args = parser.parse_args()
preprocess_iwslt17(
args.root,
"fr",
"en",
args.bpe_vocab,
args.char_vocab,
args.bbpe_vocab,
args.byte_vocab,
)
if __name__ == "__main__":
main()
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
PY_BIN_ROOT=
# PyPI dependency
${PY_BIN_ROOT}pip install sentencepiece sacremoses
# Get data
if [ ! -d "data" ]; then
mkdir data
fi
if [ ! -f "data/fr-en.tgz" ]; then
wget https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz -P data
tar xvf data/fr-en.tgz -C data
fi
${PY_BIN_ROOT}python get_bitext.py --bpe-vocab 16384 --byte-vocab --char-vocab
for VOCAB_SIZE in 2048 4096; do
${PY_BIN_ROOT}python get_bitext.py --bpe-vocab ${VOCAB_SIZE} --bbpe-vocab ${VOCAB_SIZE}
done
rm -r data/fr-en data/fr-en.tgz
# Generate binary dataset
${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_bpe16384 --joined-dictionary \
--workers "$(nproc)" --trainpref data/train.moses.bpe16384 --validpref data/valid.moses.bpe16384 \
--testpref data/test.moses.bpe16384
${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_bytes --joined-dictionary \
--workers "$(nproc)" --trainpref data/train.moses.bytes --validpref data/valid.moses.bytes \
--testpref data/test.moses.bytes
${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_chars --joined-dictionary \
--workers "$(nproc)" --trainpref data/train.moses.chars --validpref data/valid.moses.chars \
--testpref data/test.moses.chars
for VOCAB_SIZE in 2048 4096; do
for TYPE in bbpe bpe; do
${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir "data/bin_${TYPE}${VOCAB_SIZE}" \
--joined-dictionary --workers "$(nproc)" --trainpref "data/train.moses.${TYPE}${VOCAB_SIZE}" \
--validpref "data/valid.moses.${TYPE}${VOCAB_SIZE}" --testpref "data/test.moses.${TYPE}${VOCAB_SIZE}"
done
done
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch.nn as nn
import torch.nn.functional as F
from fairseq.models import register_model, register_model_architecture
from fairseq.models.transformer import TransformerEncoder, TransformerModel
@register_model("gru_transformer")
class GRUTransformerModel(TransformerModel):
@classmethod
def build_encoder(cls, args, src_dict, embed_tokens):
return GRUTransformerEncoder(args, src_dict, embed_tokens)
class GRUTransformerEncoder(TransformerEncoder):
def __init__(self, args, dictionary, embed_tokens):
super().__init__(args, dictionary, embed_tokens)
self.emb_ctx = nn.GRU(
input_size=embed_tokens.embedding_dim,
hidden_size=embed_tokens.embedding_dim // 2,
num_layers=1,
bidirectional=True,
)
def forward_embedding(self, src_tokens):
# embed tokens and positions
x = embed = self.embed_scale * self.embed_tokens(src_tokens)
if self.embed_positions is not None:
x = embed + self.embed_positions(src_tokens)
# contextualize embeddings
x = x.transpose(0, 1)
x = self.dropout_module(x)
x, _ = self.emb_ctx.forward(x)
x = x.transpose(0, 1)
if self.layernorm_embedding is not None:
x = self.layernorm_embedding(x)
x = self.dropout_module(x)
return x, embed
@register_model_architecture("gru_transformer", "gru_transformer")
def gru_transformer_base_architecture(args):
args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
args.encoder_layers = getattr(args, "encoder_layers", 6)
args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
args.decoder_ffn_embed_dim = getattr(
args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
)
args.decoder_layers = getattr(args, "decoder_layers", 6)
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
args.attention_dropout = getattr(args, "attention_dropout", 0.0)
args.activation_dropout = getattr(args, "activation_dropout", 0.0)
args.activation_fn = getattr(args, "activation_fn", "relu")
args.dropout = getattr(args, "dropout", 0.1)
args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
args.share_decoder_input_output_embed = getattr(
args, "share_decoder_input_output_embed", False
)
args.share_all_embeddings = getattr(args, "share_all_embeddings", False)
args.no_token_positional_embeddings = getattr(
args, "no_token_positional_embeddings", False
)
args.adaptive_input = getattr(args, "adaptive_input", False)
args.no_cross_attention = getattr(args, "no_cross_attention", False)
args.cross_self_attention = getattr(args, "cross_self_attention", False)
args.layer_wise_attention = getattr(args, "layer_wise_attention", False)
args.decoder_output_dim = getattr(
args, "decoder_output_dim", args.decoder_embed_dim
)
args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
args.layernorm_embedding = getattr(args, "layernorm_embedding", False)
@register_model_architecture("gru_transformer", "gru_transformer_big")
def gru_transformer_big(args):
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024)
args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
args.dropout = getattr(args, "dropout", 0.3)
gru_transformer_base_architecture(args)
# CamemBERT: a Tasty French Language Model
## Introduction
[CamemBERT](https://arxiv.org/abs/1911.03894) is a pretrained language model trained on 138GB of French text based on RoBERTa.
Also available in [github.com/huggingface/transformers](https://github.com/huggingface/transformers/).
## Pre-trained models
| Model | #params | Download | Arch. | Training data |
|--------------------------------|---------|--------------------------------------------------------------------------------------------------------------------------|-------|-----------------------------------|
| `camembert` / `camembert-base` | 110M | [camembert-base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz) | Base | OSCAR (138 GB of text) |
| `camembert-large` | 335M | [camembert-large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-large.tar.gz) | Large | CCNet (135 GB of text) |
| `camembert-base-ccnet` | 110M | [camembert-base-ccnet.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-ccnet.tar.gz) | Base | CCNet (135 GB of text) |
| `camembert-base-wikipedia-4gb` | 110M | [camembert-base-wikipedia-4gb.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-wikipedia-4gb.tar.gz) | Base | Wikipedia (4 GB of text) |
| `camembert-base-oscar-4gb` | 110M | [camembert-base-oscar-4gb.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-oscar-4gb.tar.gz) | Base | Subsample of OSCAR (4 GB of text) |
| `camembert-base-ccnet-4gb` | 110M | [camembert-base-ccnet-4gb.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-ccnet-4gb.tar.gz) | Base | Subsample of CCNet (4 GB of text) |
## Example usage
### fairseq
##### Load CamemBERT from torch.hub (PyTorch >= 1.1):
```python
import torch
camembert = torch.hub.load('pytorch/fairseq', 'camembert')
camembert.eval() # disable dropout (or leave in train mode to finetune)
```
##### Load CamemBERT (for PyTorch 1.0 or custom models):
```python
# Download camembert model
wget https://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz
tar -xzvf camembert.tar.gz
# Load the model in fairseq
from fairseq.models.roberta import CamembertModel
camembert = CamembertModel.from_pretrained('/path/to/camembert')
camembert.eval() # disable dropout (or leave in train mode to finetune)
```
##### Filling masks:
```python
masked_line = 'Le camembert est <mask> :)'
camembert.fill_mask(masked_line, topk=3)
# [('Le camembert est délicieux :)', 0.4909118115901947, ' délicieux'),
# ('Le camembert est excellent :)', 0.10556942224502563, ' excellent'),
# ('Le camembert est succulent :)', 0.03453322499990463, ' succulent')]
```
##### Extract features from Camembert:
```python
# Extract the last layer's features
line = "J'aime le camembert !"
tokens = camembert.encode(line)
last_layer_features = camembert.extract_features(tokens)
assert last_layer_features.size() == torch.Size([1, 10, 768])
# Extract all layer's features (layer 0 is the embedding layer)
all_layers = camembert.extract_features(tokens, return_all_hiddens=True)
assert len(all_layers) == 13
assert torch.all(all_layers[-1] == last_layer_features)
```
## Citation
If you use our work, please cite:
```bibtex
@inproceedings{martin2020camembert,
title={CamemBERT: a Tasty French Language Model},
author={Martin, Louis and Muller, Benjamin and Su{\'a}rez, Pedro Javier Ortiz and Dupont, Yoann and Romary, Laurent and de la Clergerie, {\'E}ric Villemonte and Seddah, Djam{\'e} and Sagot, Beno{\^\i}t},
booktitle={Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
year={2020}
}
```
# (Vectorized) Lexically constrained decoding with dynamic beam allocation
This page provides instructions for how to use lexically constrained decoding in Fairseq.
Fairseq implements the code described in the following papers:
* [Fast Lexically Constrained Decoding With Dynamic Beam Allocation](https://www.aclweb.org/anthology/N18-1119/) (Post & Vilar, 2018)
* [Improved Lexically Constrained Decoding for Translation and Monolingual Rewriting](https://www.aclweb.org/anthology/N19-1090/) (Hu et al., 2019)
## Quick start
Constrained search is enabled by adding the command-line argument `--constraints` to `fairseq-interactive`.
Constraints are appended to each line of input, separated by tabs. Each constraint (one or more tokens)
is a separate field.
The following command, using [Fairseq's WMT19 German--English model](https://github.com/pytorch/fairseq/blob/main/examples/wmt19/README.md),
translates the sentence *Die maschinelle Übersetzung ist schwer zu kontrollieren.* with the constraints
"hard" and "to influence".
echo -e "Die maschinelle Übersetzung ist schwer zu kontrollieren.\thard\ttoinfluence" \
| normalize.py | tok.py \
| fairseq-interactive /path/to/model \
--path /path/to/model/model1.pt \
--bpe fastbpe \
--bpe-codes /path/to/model/bpecodes \
--constraints \
-s de -t en \
--beam 10
(tok.py and normalize.py can be found in the same directory as this README; they are just shortcuts around Fairseq's WMT19 preprocessing).
This will generate the following output:
[snip]
S-0 Die masch@@ in@@ elle Über@@ setzung ist schwer zu kontrollieren .
W-0 1.844 seconds
C-0 hard
C-0 influence
H-0 -1.5333266258239746 Mach@@ ine trans@@ lation is hard to influence .
D-0 -1.5333266258239746 Machine translation is hard to influence .
P-0 -0.5434 -0.1423 -0.1930 -0.1415 -0.2346 -1.8031 -0.1701 -11.7727 -0.1815 -0.1511
By default, constraints are generated in the order supplied, with any number (zero or more) of tokens generated
between constraints. If you wish for the decoder to order the constraints, then use `--constraints unordered`.
Note that you may want to use a larger beam.
## Implementation details
The heart of the implementation is in `fairseq/search.py`, which adds a `LexicallyConstrainedBeamSearch` instance.
This instance of beam search tracks the progress of each hypothesis in the beam through the set of constraints
provided for each input sentence. It does this using one of two classes, both found in `fairseq/token_generation_contstraints.py`:
* OrderedConstraintState: assumes the `C` input constraints will be generated in the provided order
* UnorderedConstraintState: tries to apply `C` (phrasal) constraints in all `C!` orders
## Differences from Sockeye
There are a number of [differences from Sockeye's implementation](https://awslabs.github.io/sockeye/inference.html#lexical-constraints).
* Generating constraints in the order supplied (the default option here) is not available in Sockeye.
* Due to an improved beam allocation method, there is no need to prune the beam.
* Again due to better allocation, beam sizes as low as 10 or even 5 are often sufficient.
* [The vector extensions described in Hu et al.](https://github.com/edwardjhu/sockeye/tree/trie_constraints) (NAACL 2019) were never merged
into the main Sockeye branch.
## Citation
The paper first describing lexical constraints for seq2seq decoding is:
```bibtex
@inproceedings{hokamp-liu-2017-lexically,
title = "Lexically Constrained Decoding for Sequence Generation Using Grid Beam Search",
author = "Hokamp, Chris and
Liu, Qun",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2017",
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P17-1141",
doi = "10.18653/v1/P17-1141",
pages = "1535--1546",
}
```
The fairseq implementation uses the extensions described in
```bibtex
@inproceedings{post-vilar-2018-fast,
title = "Fast Lexically Constrained Decoding with Dynamic Beam Allocation for Neural Machine Translation",
author = "Post, Matt and
Vilar, David",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)",
month = jun,
year = "2018",
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N18-1119",
doi = "10.18653/v1/N18-1119",
pages = "1314--1324",
}
```
and
```bibtex
@inproceedings{hu-etal-2019-improved,
title = "Improved Lexically Constrained Decoding for Translation and Monolingual Rewriting",
author = "Hu, J. Edward and
Khayrallah, Huda and
Culkin, Ryan and
Xia, Patrick and
Chen, Tongfei and
Post, Matt and
Van Durme, Benjamin",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
month = jun,
year = "2019",
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N19-1090",
doi = "10.18653/v1/N19-1090",
pages = "839--850",
}
```
#!/usr/bin/env python3
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import sys
from sacremoses.normalize import MosesPunctNormalizer
def main(args):
normalizer = MosesPunctNormalizer(lang=args.lang, penn=args.penn)
for line in sys.stdin:
print(normalizer.normalize(line.rstrip()), flush=True)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--lang", "-l", default="en")
parser.add_argument("--penn", "-p", action="store_true")
args = parser.parse_args()
main(args)
#!/usr/bin/env python3
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import sys
import sacremoses
def main(args):
"""Tokenizes, preserving tabs"""
mt = sacremoses.MosesTokenizer(lang=args.lang)
def tok(s):
return mt.tokenize(s, return_str=True)
for line in sys.stdin:
parts = list(map(tok, line.split("\t")))
print(*parts, sep="\t", flush=True)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--lang", "-l", default="en")
parser.add_argument("--penn", "-p", action="store_true")
parser.add_argument("--fields", "-f", help="fields to tokenize")
args = parser.parse_args()
main(args)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment