KEEP A GOING AN IF YOU'RE LUCKY YOU'LL RUN PLUMB INTO THEM WAS THE JEERING ANSWER AS THE SLEEPY COWMEN SPURRED THEIR PONIES ON TOWARD CAMP MUTTERING THEIR DISAPPROVAL OF TAKING ALONG A BUNCH OF BOYS ON A CATTLE DRIVE (None-0)
KEEP A GOIN AND IF YOU'RE LUCKY YOU'LL RUN PLUMB INTO THEM WAS THE JEERING ANSWER AS THE SLEEPY COWMEN SPURRED THEIR PONIES ON TOWARD CAMP MUTTERING THEIR DISAPPROVAL OF TAKING ALONG A BUNCH OF BOYS ON A CATTLE DRIVE (None-0)
#!/bin/bash
set -e
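# Regression test for HuBERT feature and unit dumping: download the pretrained
# checkpoints, dump features / k-means labels for the sample utterance, and diff
# the outputs against the reference files shipped in examples/hubert/tests.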
sizes="base large xlarge"
declare -A ckpt_urls
ckpt_urls[base]="https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt"
ckpt_urls[large]="https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k.pt"
ckpt_urls[xlarge]="https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k.pt"
declare -A km_layers
km_layers[base]=9
km_layers[large]=20
km_layers[xlarge]=30
declare -A km_urls
km_urls[base]="https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960_L9_km500.bin"
declare -A km_nunits
km_nunits[base]=500
test_dir=./examples/hubert/tests
split=sample
echo -e "${test_dir}\n6313-76958-0021.flac\t190800" > "${test_dir}/${split}.tsv"
check_feature () {
echo "checking features..."
size=$1
ckpt_url=$2
km_layer=$3
ckpt_path="$test_dir/$(basename "$ckpt_url")"
if [ ! -f "$ckpt_path" ]; then
echo "downloading $ckpt_url to $ckpt_path"
wget "$ckpt_url" -O "$ckpt_path"
fi
python ./examples/hubert/simple_kmeans/dump_hubert_feature.py \
"${test_dir}" "${split}" "${ckpt_path}" "${km_layer}" 1 0 "${test_dir}"
if diff -q "${test_dir}/${split}.${size}.L${km_layer}.npy" "${test_dir}/${split}_0_1.npy" &>/dev/null; then
echo "...passed npy check"
else
echo "...failed npy check"
fi
if diff -q "${test_dir}/${split}.${size}.L${km_layer}.len" "${test_dir}/${split}_0_1.len" &>/dev/null; then
echo "...passed len check"
else
echo "...failed len check"
fi
}
check_unit () {
echo "checking units..."
size=$1
km_url=$2
km_layer=$3
km_nunit=$4
km_path="$test_dir/$(basename "$km_url")"
if [ ! -f "$km_path" ]; then
echo "downloading $km_url to $km_path"
wget "$km_url" -O "$km_path"
fi
python ./examples/hubert/simple_kmeans/dump_km_label.py \
"${test_dir}" "${split}" "${km_path}" 1 0 "${test_dir}"
if diff -q "${test_dir}/${split}.${size}.L${km_layer}.km${km_nunit}.km" "${test_dir}/${split}_0_1.km" &>/dev/null; then
echo "...passed unit check"
else
echo "...failed unit check"
fi
}
for size in $sizes; do
echo "=== Running unit test for HuBERT $size ==="
check_feature "$size" "${ckpt_urls[$size]}" "${km_layers[$size]}"
if [ -n "${km_urls[$size]}" ]; then
check_unit "$size" "${km_urls[$size]}" "${km_layers[$size]}" "${km_nunits[$size]}"
fi
rm -f $test_dir/${split}_0_1.*
done
#!/bin/bash
set -e
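# Regression test for fine-tuned HuBERT ASR: decode the sample utterance with the
# fine-tuned checkpoints and diff the hypotheses against the reference hypo.word files.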
sizes="large xlarge"
declare -A ckpt_urls
ckpt_urls[large]="https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k_finetune_ls960.pt"
ckpt_urls[xlarge]="https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k_finetune_ls960.pt"
test_dir=$(pwd)/examples/hubert/tests
split=sample
echo -e "${test_dir}\n6313-76958-0021.flac\t190800" > "${test_dir}/${split}.tsv"
echo -e "K E E P | A | G O I N G | A N D | I F | Y O U ' R E | L U C K Y | Y O U ' L L | R U N | P L U M B | I N T O | T H E M | W A S | T H E | J E E R I N G | A N S W E R | A S | T H E | S L E E P Y | C O W M E N | S P U R R E D | T H E I R | P O N I E S | O N | T O W A R D | C A M P | M U T T E R I N G | T H E I R | D I S A P P R O V A L | O F | T A K I N G | A L O N G | A | B U N C H | O F | B O Y S | O N | A | C A T T L E | D R I V E |" > "${test_dir}/${split}.ltr"
check_asr () {
echo "checking asr outputs..."
size=$1
ckpt_url=$2
ckpt_path="$test_dir/$(basename "$ckpt_url")"
if [ ! -f "$ckpt_path" ]; then
echo "downloading $ckpt_url to $ckpt_path"
wget "$ckpt_url" -O "$ckpt_path"
fi
python examples/speech_recognition/new/infer.py \
--config-dir examples/hubert/config/decode --config-name infer_viterbi \
common_eval.path="${ckpt_path}" task.data="${test_dir}" task.normalize=true \
decoding.results_path="${test_dir}/pred" \
common_eval.results_path="${test_dir}/pred" \
common_eval.quiet=false dataset.gen_subset="${split}"
if diff -q "${test_dir}/pred/hypo.word" "${test_dir}/${split}.${size}.hypo.word" &>/dev/null; then
echo "...passed word check"
else
echo "...failed word check"
fi
rm -rf "${test_dir}/pred"
}
for size in $sizes; do
check_asr "$size" "${ckpt_urls[$size]}"
done
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
src_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2.pt"
ref_ckpt = "/checkpoint/wnhsu/w2v/hubert_icassp_oss_v3/iter2_km100-400k-grp-L6/oss.km500_p0_1_s334.pmw1_0.puw0_0.grpnorm.ml10.mp0_8.untie.mxsz250000.ufreq1.maxtok1400000.MU100k.s1337.ngpu32/checkpoint_last.pt"
new_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2_updated.pt"
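# update_state renames the old label embedding key and fills in the task/labels
# fields so this legacy checkpoint loads with the current hubert_pretraining task.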
def update_state(state):
state["model"]["label_embs_concat"] = state["model"].pop("label_embs")
state["args"].task = "hubert_pretraining"
state["args"].labels = f"['{state['args'].labels}']"
return state
src_state = torch.load(src_ckpt)
src_state = update_state(src_state)
torch.save(src_state, new_ckpt)
# Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)
This page includes instructions for training models described in [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](https://arxiv.org/abs/1909.02074).
## Training a joint alignment-translation model on WMT'18 En-De
##### 1. Extract and preprocess the WMT'18 En-De data
```bash
./prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh
```
##### 2. Generate alignments with a statistical alignment toolkit, e.g. Giza++ or FastAlign.
In this example, we use FastAlign.
```bash
git clone git@github.com:clab/fast_align.git
pushd fast_align
mkdir build
cd build
cmake ..
make
popd
ALIGN=fast_align/build/fast_align
paste bpe.32k/train.en bpe.32k/train.de | awk -F '\t' '{print $1 " ||| " $2}' > bpe.32k/train.en-de
$ALIGN -i bpe.32k/train.en-de -d -o -v > bpe.32k/train.align
```
##### 3. Preprocess the dataset with the above generated alignments.
```bash
fairseq-preprocess \
--source-lang en --target-lang de \
--trainpref bpe.32k/train \
--validpref bpe.32k/valid \
--testpref bpe.32k/test \
--align-suffix align \
--destdir binarized/ \
--joined-dictionary \
--workers 32
```
##### 4. Train a model
```bash
fairseq-train \
binarized \
--arch transformer_wmt_en_de_big_align --share-all-embeddings \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --activation-fn relu \
--lr 0.0002 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
--dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
--max-tokens 3500 --label-smoothing 0.1 \
--save-dir ./checkpoints --log-interval 1000 --max-update 60000 \
--keep-interval-updates -1 --save-interval-updates 0 \
--load-alignments --criterion label_smoothed_cross_entropy_with_alignment \
--fp16
```
Note that the `--fp16` flag requires that you have CUDA 9.1 or greater and a Volta GPU or newer.
If you want to train the above model with big batches (assuming your machine has 8 GPUs; a full command sketch follows this list):
- add `--update-freq 8` to simulate training on 8x8=64 GPUs
- increase the learning rate; 0.0007 works well for big batches
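A full command for this big-batch setting might look like the following sketch (identical to the command above except for `--lr 0.0007` and the added `--update-freq 8`):
```bash
fairseq-train \
    binarized \
    --arch transformer_wmt_en_de_big_align --share-all-embeddings \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --activation-fn relu \
    --lr 0.0007 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
    --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
    --max-tokens 3500 --label-smoothing 0.1 \
    --save-dir ./checkpoints --log-interval 1000 --max-update 60000 \
    --keep-interval-updates -1 --save-interval-updates 0 \
    --load-alignments --criterion label_smoothed_cross_entropy_with_alignment \
    --update-freq 8 --fp16
```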
##### 5. Evaluate and generate the alignments (BPE level)
```bash
fairseq-generate \
binarized --gen-subset test --print-alignment \
--source-lang en --target-lang de \
--path checkpoints/checkpoint_best.pt --beam 5 --nbest 1
```
##### 6. Other resources.
The code for:
1. preparing alignment test sets
2. converting BPE level alignments to token level alignments
3. symmetrizing bidirectional alignments
4. evaluating alignments using AER metric
can be found [here](https://github.com/lilt/alignment-scripts).
## Citation
```bibtex
@inproceedings{garg2019jointly,
title = {Jointly Learning to Align and Translate with Transformer Models},
author = {Garg, Sarthak and Peitz, Stephan and Nallasamy, Udhyakumar and Paulik, Matthias},
booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)},
address = {Hong Kong},
month = {November},
url = {https://arxiv.org/abs/1909.02074},
year = {2019},
}
```
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git
SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
URLS=(
"http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
"http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
"http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz"
"http://data.statmt.org/wmt18/translation-task/rapid2016.tgz"
"http://data.statmt.org/wmt17/translation-task/dev.tgz"
"http://statmt.org/wmt14/test-full.tgz"
)
CORPORA=(
"training/europarl-v7.de-en"
"commoncrawl.de-en"
"training-parallel-nc-v13/news-commentary-v13.de-en"
"rapid2016.de-en"
)
if [ ! -d "$SCRIPTS" ]; then
echo "Please set SCRIPTS variable correctly to point to Moses scripts."
exit 1
fi
src=en
tgt=de
lang=en-de
prep=wmt18_en_de
tmp=$prep/tmp
orig=orig
dev=dev/newstest2012
codes=32000
bpe=bpe.32k
mkdir -p $orig $tmp $prep $bpe
cd $orig
for ((i=0;i<${#URLS[@]};++i)); do
url=${URLS[i]}
file=$(basename $url)
if [ -f $file ]; then
echo "$file already exists, skipping download"
else
wget "$url"
if [ -f $file ]; then
echo "$url successfully downloaded."
else
echo "$url not successfully downloaded."
exit 1
fi
if [ ${file: -4} == ".tgz" ]; then
tar zxvf $file
elif [ ${file: -4} == ".tar" ]; then
tar xvf $file
fi
fi
done
cd ..
echo "pre-processing train data..."
for l in $src $tgt; do
rm -rf $tmp/train.tags.$lang.tok.$l
for f in "${CORPORA[@]}"; do
cat $orig/$f.$l | \
perl $REM_NON_PRINT_CHAR | \
perl $TOKENIZER -threads 8 -l $l -no-escape >> $tmp/train.tags.$lang.tok.$l
done
done
echo "pre-processing test data..."
for l in $src $tgt; do
if [ "$l" == "$src" ]; then
t="src"
else
t="ref"
fi
grep '<seg id' $orig/test-full/newstest2014-deen-$t.$l.sgm | \
sed -e 's/<seg id="[0-9]*">\s*//g' | \
sed -e 's/\s*<\/seg>\s*//g' | \
sed -e "s/\’/\'/g" | \
perl $TOKENIZER -threads 8 -l $l -no-escape > $tmp/test.$l
echo ""
done
# apply length filtering before BPE
perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train 1 100
# use newstest2012 for valid
echo "pre-processing valid data..."
for l in $src $tgt; do
rm -rf $tmp/valid.$l
cat $orig/$dev.$l | \
perl $REM_NON_PRINT_CHAR | \
perl $TOKENIZER -threads 8 -l $l -no-escape >> $tmp/valid.$l
done
mkdir output
mv $tmp/{train,valid,test}.{$src,$tgt} output
#BPE
git clone https://github.com/glample/fastBPE.git
pushd fastBPE
g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
popd
fastBPE/fast learnbpe $codes output/train.$src output/train.$tgt > $bpe/codes
for split in {train,valid,test}; do for lang in {en,de}; do fastBPE/fast applybpe $bpe/$split.$lang output/$split.$lang $bpe/codes; done; done
# Adaptive Input Representations for Neural Language Modeling (Baevski and Auli, 2018)
## Pre-trained models
Description | Parameters | Dataset | Model and Test set(s)
---|---:|---|---
Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 1026M | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2)
Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 247M | [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2)
## Training an LM with adaptive inputs
First, see the general [language modeling README](README.md) for instructions on
preprocessing the WikiText-103 data.
Then use the following training command to train a model with adaptive inputs
using the `transformer_lm_wiki103` model architecture:
```bash
fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/transformer_wikitext-103 \
--arch transformer_lm_wiki103 \
--max-update 286000 --lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \
--warmup-updates 16000 --warmup-init-lr 1e-07 --stop-min-lr 1e-09 --optimizer nag --min-lr 0.0001 --clip-norm 0.1 \
--criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \
--sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=legacy_ddp
```
## Citation
```bibtex
@inproceedings{
baevski2018adaptive,
title={Adaptive Input Representations for Neural Language Modeling},
author={Alexei Baevski and Michael Auli},
booktitle={International Conference on Learning Representations},
year={2019},
url={https://openreview.net/forum?id=ByxZX20qFQ},
}
```
# Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)
## Example usage
First download and preprocess the data following the main [language modeling README](README.md).
Then to train a convolutional LM using the `fconv_lm_dauphin_wikitext103`
architecture:
```bash
fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/fconv_wikitext-103 \
--arch fconv_lm_dauphin_wikitext103 \
--adaptive-softmax-cutoff 10000,20000,200000 \
--dropout 0.2 \
--criterion adaptive_loss \
--optimizer nag --clip-norm 0.1 --weight-decay 5e-06 \
--lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \
--max-tokens 1024 --tokens-per-sample 1024 \
--ddp-backend legacy_ddp \
--max-epoch 35
```
And evaluate with:
```bash
fairseq-eval-lm data-bin/wikitext-103 --path checkpoints/fconv_wikitext-103/checkpoint_best.pt
```
## Citation
```bibtex
@inproceedings{dauphin2017language,
title={Language Modeling with Gated Convolutional Networks},
author={Dauphin, Yann N and Fan, Angela and Auli, Michael and Grangier, David},
booktitle={Proceedings of the 34th International Conference on Machine Learning-Volume 70},
pages={933--941},
year={2017},
organization={JMLR}
}
```
# Neural Language Modeling
## Pre-trained models
Model | Description | Dataset | Download
---|---|---|---
`transformer_lm.gbw.adaptive_huge` | Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) <br> 1026M params | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2)
`transformer_lm.wiki103.adaptive` | Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) <br> 247M params | [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2)
`transformer_lm.wmt19.en` | English LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz)
`transformer_lm.wmt19.de` | German LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz)
`transformer_lm.wmt19.ru` | Russian LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz)
## Example usage
We require a few additional Python dependencies for preprocessing:
```bash
pip install fastBPE sacremoses
```
To sample from a language model using PyTorch Hub:
```python
import torch
# List available models
torch.hub.list('pytorch/fairseq') # [..., 'transformer_lm.wmt19.en', ...]
# Load an English LM trained on WMT'19 News Crawl data
en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer='moses', bpe='fastbpe')
en_lm.eval() # disable dropout
# Move model to GPU
en_lm.cuda()
# Sample from the language model
en_lm.sample('Barack Obama', beam=1, sampling=True, sampling_topk=10, temperature=0.8)
# "Barack Obama is coming to Sydney and New Zealand (...)"
# Compute perplexity for a sequence
en_lm.score('Barack Obama is coming to Sydney and New Zealand')['positional_scores'].mean().neg().exp()
# tensor(15.1474)
# The same interface can be used with custom models as well
from fairseq.models.transformer_lm import TransformerLanguageModel
custom_lm = TransformerLanguageModel.from_pretrained('/path/to/model/dir', 'checkpoint100.pt', tokenizer='moses', bpe='fastbpe')
custom_lm.sample('Barack Obama', beam=5)
# "Barack Obama (...)"
```
## Training a transformer language model with the CLI tools
### 1) Preprocess the data
First download and prepare the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/):
```bash
cd examples/language_model/
bash prepare-wikitext-103.sh
cd ../..
```
Next preprocess/binarize the data:
```bash
TEXT=examples/language_model/wikitext-103
fairseq-preprocess \
--only-source \
--trainpref $TEXT/wiki.train.tokens \
--validpref $TEXT/wiki.valid.tokens \
--testpref $TEXT/wiki.test.tokens \
--destdir data-bin/wikitext-103 \
--workers 20
```
### 2) Train a language model
Next we'll train a basic transformer language model on wikitext-103. For more
advanced usage, see the [adaptive inputs README](README.adaptive_inputs.md).
To train a basic LM (assumes 2 GPUs):
```bash
fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/transformer_wikitext-103 \
--arch transformer_lm --share-decoder-input-output-embed \
--dropout 0.1 \
--optimizer adam --adam-betas '(0.9, 0.98)' --weight-decay 0.01 --clip-norm 0.0 \
--lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
--tokens-per-sample 512 --sample-break-mode none \
--max-tokens 2048 --update-freq 16 \
--fp16 \
--max-update 50000
```
If you run out of memory, try reducing `--max-tokens` (max number of tokens per
batch) or `--tokens-per-sample` (max sequence length). You can also adjust
`--update-freq` to accumulate gradients and simulate training on a different
number of GPUs.
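For example, a lower-memory variant of the command above (illustrative values, not a tuned configuration) halves `--max-tokens` and doubles `--update-freq`, which keeps the effective batch size roughly the same:
```bash
fairseq-train --task language_modeling \
    data-bin/wikitext-103 \
    --save-dir checkpoints/transformer_wikitext-103 \
    --arch transformer_lm --share-decoder-input-output-embed \
    --dropout 0.1 \
    --optimizer adam --adam-betas '(0.9, 0.98)' --weight-decay 0.01 --clip-norm 0.0 \
    --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
    --tokens-per-sample 512 --sample-break-mode none \
    --max-tokens 1024 --update-freq 32 \
    --fp16 \
    --max-update 50000
```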
### 3) Evaluate
```bash
fairseq-eval-lm data-bin/wikitext-103 \
--path checkpoints/transformer_wikitext-103/checkpoint_best.pt \
--batch-size 2 \
--tokens-per-sample 512 \
--context-window 400
# | Evaluated 245569 tokens in 56.1s (4379.02 tokens/s)
# | Loss: 3.4164, Perplexity: 30.46
```
*Note:* The `--context-window` option controls how much context is provided to
each token when computing perplexity. When the window size is 0, the dataset is
chunked into segments of length 512 and perplexity is computed over each segment
normally. However, this results in worse (higher) perplexity since tokens that
appear earlier in each segment have less conditioning. When the maximum window
size is used (511 in this case), then we compute perplexity for each token
fully conditioned on 511 tokens of context. This slows down evaluation
significantly, since we must run a separate forward pass for every token in the
dataset, but results in better (lower) perplexity.
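Concretely, the fully conditioned evaluation described above corresponds to passing the maximum window size (same data and checkpoint as in the example above; expect much slower evaluation):
```bash
fairseq-eval-lm data-bin/wikitext-103 \
    --path checkpoints/transformer_wikitext-103/checkpoint_best.pt \
    --batch-size 2 \
    --tokens-per-sample 512 \
    --context-window 511
```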
## Convolutional language models
Please see the [convolutional LM README](README.conv.md) for instructions on
training convolutional language models.
#!/bin/bash
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
URLS=(
"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"
)
FILES=(
"wikitext-103-v1.zip"
)
for ((i=0;i<${#URLS[@]};++i)); do
file=${FILES[i]}
if [ -f $file ]; then
echo "$file already exists, skipping download"
else
url=${URLS[i]}
wget "$url"
if [ -f $file ]; then
echo "$url successfully downloaded."
else
echo "$url not successfully downloaded."
exit -1
fi
if [ ${file: -4} == ".tgz" ]; then
tar zxvf $file
elif [ ${file: -4} == ".tar" ]; then
tar xvf $file
elif [ ${file: -4} == ".zip" ]; then
unzip $file
fi
fi
done
cd ..
# LASER Language-Agnostic SEntence Representations
LASER is a library to calculate and use multilingual sentence embeddings.
You can find more information about LASER and how to use it on the official [LASER repository](https://github.com/facebookresearch/LASER).
This folder contains source code for training LASER embeddings.
## Prepare data and configuration file
Binarize your data with fairseq, as described [here](https://fairseq.readthedocs.io/en/latest/getting_started.html#data-pre-processing).
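As a minimal sketch (hypothetical paths and language names; substitute your own SentencePiece vocabularies and raw data), binarizing one language pair for the config below might look like:
```
fairseq-preprocess \
    --source-lang srclang1 --target-lang tgtlang0 \
    --trainpref /path/to/raw/srclang1-tgtlang0/train \
    --destdir /path/to/srclang1-tgtlang0 \
    --srcdict /path/to/spm.src.cvocab \
    --tgtdict /path/to/spm.tgt.cvocab \
    --dataset-impl lazy \
    --workers 8
```
Note that the `laser` task in this folder loads plain `IndexedDataset` files, so a non-mmap `--dataset-impl` (e.g. `lazy` or `cached`) is assumed here instead of the default `mmap`.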
Create a json config file with this format:
```
{
"src_vocab": "/path/to/spm.src.cvocab",
"tgt_vocab": "/path/to/spm.tgt.cvocab",
"train": [
{
"type": "translation",
"id": 0,
"src": "/path/to/srclang1-tgtlang0/train.srclang1",
"tgt": "/path/to/srclang1-tgtlang0/train.tgtlang0"
},
{
"type": "translation",
"id": 1,
"src": "/path/to/srclang1-tgtlang1/train.srclang1",
"tgt": "/path/to/srclang1-tgtlang1/train.tgtlang1"
},
{
"type": "translation",
"id": 0,
"src": "/path/to/srclang2-tgtlang0/train.srclang2",
"tgt": "/path/to/srclang2-tgtlang0/train.tgtlang0"
},
{
"type": "translation",
"id": 1,
"src": "/path/to/srclang2-tgtlang1/train.srclang2",
"tgt": "/path/to/srclang2-tgtlang1/train.tgtlang1"
},
...
],
"valid": [
{
"type": "translation",
"id": 0,
"src": "/unused",
"tgt": "/unused"
}
]
}
```
where each path points to a binarized, indexed fairseq dataset.
`id` represents the target language id.
## Training Command Line Example
```
fairseq-train \
/path/to/configfile_described_above.json \
--user-dir examples/laser/laser_src \
--log-interval 100 --log-format simple \
--task laser --arch laser_lstm \
--save-dir . \
--optimizer adam \
--lr 0.001 \
--lr-scheduler inverse_sqrt \
--clip-norm 5 \
--warmup-updates 90000 \
--update-freq 2 \
--dropout 0.0 \
--encoder-dropout-out 0.1 \
--max-tokens 2000 \
--max-epoch 50 \
--encoder-bidirectional \
--encoder-layers 5 \
--encoder-hidden-size 512 \
--decoder-layers 1 \
--decoder-hidden-size 2048 \
--encoder-embed-dim 320 \
--decoder-embed-dim 320 \
--decoder-lang-embed-dim 32 \
--warmup-init-lr 0.001 \
--disable-validation
```
## Applications
We showcase several applications of multilingual sentence embeddings
with code to reproduce our results (in the directory "tasks").
* [**Cross-lingual document classification**](https://github.com/facebookresearch/LASER/tree/master/tasks/mldoc) using the
[*MLDoc*](https://github.com/facebookresearch/MLDoc) corpus [2,6]
* [**WikiMatrix**](https://github.com/facebookresearch/LASER/tree/master/tasks/WikiMatrix)
Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia [7]
* [**Bitext mining**](https://github.com/facebookresearch/LASER/tree/master/tasks/bucc) using the
[*BUCC*](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) corpus [3,5]
* [**Cross-lingual NLI**](https://github.com/facebookresearch/LASER/tree/master/tasks/xnli)
using the [*XNLI*](https://www.nyu.edu/projects/bowman/xnli/) corpus [4,5,6]
* [**Multilingual similarity search**](https://github.com/facebookresearch/LASER/tree/master/tasks/similarity) [1,6]
* [**Sentence embedding of text files**](https://github.com/facebookresearch/LASER/tree/master/tasks/embed)
an example of how to calculate sentence embeddings for arbitrary text files in any of the supported languages.
**For all tasks, we use exactly the same multilingual encoder, without any task specific optimization or fine-tuning.**
## References
[1] Holger Schwenk and Matthijs Douze,
[*Learning Joint Multilingual Sentence Representations with Neural Machine Translation*](https://aclanthology.info/papers/W17-2619/w17-2619),
ACL workshop on Representation Learning for NLP, 2017
[2] Holger Schwenk and Xian Li,
[*A Corpus for Multilingual Document Classification in Eight Languages*](http://www.lrec-conf.org/proceedings/lrec2018/pdf/658.pdf),
LREC, pages 3548-3551, 2018.
[3] Holger Schwenk,
[*Filtering and Mining Parallel Data in a Joint Multilingual Space*](http://aclweb.org/anthology/P18-2037)
ACL, July 2018
[4] Alexis Conneau, Guillaume Lample, Ruty Rinott, Adina Williams, Samuel R. Bowman, Holger Schwenk and Veselin Stoyanov,
[*XNLI: Cross-lingual Sentence Understanding through Inference*](https://aclweb.org/anthology/D18-1269),
EMNLP, 2018.
[5] Mikel Artetxe and Holger Schwenk,
[*Margin-based Parallel Corpus Mining with Multilingual Sentence Embeddings*](https://arxiv.org/abs/1811.01136)
arXiv, Nov 3 2018.
[6] Mikel Artetxe and Holger Schwenk,
[*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/abs/1812.10464)
arXiv, Dec 26 2018.
[7] Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman,
[*WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia*](https://arxiv.org/abs/1907.05791)
arXiv, July 11 2019.
[8] Holger Schwenk, Guillaume Wenzek, Sergey Edunov, Edouard Grave and Armand Joulin
[*CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB*](https://arxiv.org/abs/1911.04944)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from .laser_task import * # noqa
from .laser_lstm import * # noqa
from .laser_transformer import * # noqa
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq import options, utils
from fairseq.models import (
FairseqEncoder,
FairseqIncrementalDecoder,
FairseqEncoderDecoderModel,
register_model,
register_model_architecture,
)
@register_model("laser_lstm")
class LSTMModel(FairseqEncoderDecoderModel):
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
def forward(
self,
src_tokens,
src_lengths,
prev_output_tokens=None,
tgt_tokens=None,
tgt_lengths=None,
target_language_id=None,
dataset_name="",
):
assert target_language_id is not None
src_encoder_out = self.encoder(src_tokens, src_lengths, dataset_name)
return self.decoder(
prev_output_tokens, src_encoder_out, lang_id=target_language_id
)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument(
"--dropout",
default=0.1,
type=float,
metavar="D",
help="dropout probability",
)
parser.add_argument(
"--encoder-embed-dim",
type=int,
metavar="N",
help="encoder embedding dimension",
)
parser.add_argument(
"--encoder-embed-path",
default=None,
type=str,
metavar="STR",
help="path to pre-trained encoder embedding",
)
parser.add_argument(
"--encoder-hidden-size", type=int, metavar="N", help="encoder hidden size"
)
parser.add_argument(
"--encoder-layers", type=int, metavar="N", help="number of encoder layers"
)
parser.add_argument(
"--encoder-bidirectional",
action="store_true",
help="make all layers of encoder bidirectional",
)
parser.add_argument(
"--decoder-embed-dim",
type=int,
metavar="N",
help="decoder embedding dimension",
)
parser.add_argument(
"--decoder-embed-path",
default=None,
type=str,
metavar="STR",
help="path to pre-trained decoder embedding",
)
parser.add_argument(
"--decoder-hidden-size", type=int, metavar="N", help="decoder hidden size"
)
parser.add_argument(
"--decoder-layers", type=int, metavar="N", help="number of decoder layers"
)
parser.add_argument(
"--decoder-out-embed-dim",
type=int,
metavar="N",
help="decoder output embedding dimension",
)
parser.add_argument(
"--decoder-zero-init",
type=str,
metavar="BOOL",
help="initialize the decoder hidden/cell state to zero",
)
parser.add_argument(
"--decoder-lang-embed-dim",
type=int,
metavar="N",
help="decoder language embedding dimension",
)
parser.add_argument(
"--fixed-embeddings",
action="store_true",
help="keep embeddings fixed (ENCODER ONLY)",
) # TODO Also apply to decoder embeddings?
# Granular dropout settings (if not specified these default to --dropout)
parser.add_argument(
"--encoder-dropout-in",
type=float,
metavar="D",
help="dropout probability for encoder input embedding",
)
parser.add_argument(
"--encoder-dropout-out",
type=float,
metavar="D",
help="dropout probability for encoder output",
)
parser.add_argument(
"--decoder-dropout-in",
type=float,
metavar="D",
help="dropout probability for decoder input embedding",
)
parser.add_argument(
"--decoder-dropout-out",
type=float,
metavar="D",
help="dropout probability for decoder output",
)
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure that all args are properly defaulted (in case there are any new ones)
base_architecture(args)
def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
embed_dict = utils.parse_embedding(embed_path)
utils.print_embed_overlap(embed_dict, dictionary)
return utils.load_embedding(embed_dict, dictionary, embed_tokens)
pretrained_encoder_embed = None
if args.encoder_embed_path:
pretrained_encoder_embed = load_pretrained_embedding_from_file(
args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim
)
pretrained_decoder_embed = None
if args.decoder_embed_path:
pretrained_decoder_embed = load_pretrained_embedding_from_file(
args.decoder_embed_path, task.target_dictionary, args.decoder_embed_dim
)
num_langs = task.num_tasks if hasattr(task, "num_tasks") else 0
encoder = LSTMEncoder(
dictionary=task.source_dictionary,
embed_dim=args.encoder_embed_dim,
hidden_size=args.encoder_hidden_size,
num_layers=args.encoder_layers,
dropout_in=args.encoder_dropout_in,
dropout_out=args.encoder_dropout_out,
bidirectional=args.encoder_bidirectional,
pretrained_embed=pretrained_encoder_embed,
fixed_embeddings=args.fixed_embeddings,
)
decoder = LSTMDecoder(
dictionary=task.target_dictionary,
embed_dim=args.decoder_embed_dim,
hidden_size=args.decoder_hidden_size,
out_embed_dim=args.decoder_out_embed_dim,
num_layers=args.decoder_layers,
dropout_in=args.decoder_dropout_in,
dropout_out=args.decoder_dropout_out,
zero_init=options.eval_bool(args.decoder_zero_init),
encoder_embed_dim=args.encoder_embed_dim,
encoder_output_units=encoder.output_units,
pretrained_embed=pretrained_decoder_embed,
num_langs=num_langs,
lang_embed_dim=args.decoder_lang_embed_dim,
)
return cls(encoder, decoder)
class LSTMEncoder(FairseqEncoder):
"""LSTM encoder."""
def __init__(
self,
dictionary,
embed_dim=512,
hidden_size=512,
num_layers=1,
dropout_in=0.1,
dropout_out=0.1,
bidirectional=False,
left_pad=True,
pretrained_embed=None,
padding_value=0.0,
fixed_embeddings=False,
):
super().__init__(dictionary)
self.num_layers = num_layers
self.dropout_in = dropout_in
self.dropout_out = dropout_out
self.bidirectional = bidirectional
self.hidden_size = hidden_size
num_embeddings = len(dictionary)
self.padding_idx = dictionary.pad()
if pretrained_embed is None:
self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
else:
self.embed_tokens = pretrained_embed
if fixed_embeddings:
self.embed_tokens.weight.requires_grad = False
self.lstm = LSTM(
input_size=embed_dim,
hidden_size=hidden_size,
num_layers=num_layers,
dropout=self.dropout_out if num_layers > 1 else 0.0,
bidirectional=bidirectional,
)
self.left_pad = left_pad
self.padding_value = padding_value
self.output_units = hidden_size
if bidirectional:
self.output_units *= 2
def forward(self, src_tokens, src_lengths, dataset_name):
if self.left_pad:
# convert left-padding to right-padding
src_tokens = utils.convert_padding_direction(
src_tokens,
self.padding_idx,
left_to_right=True,
)
bsz, seqlen = src_tokens.size()
# embed tokens
x = self.embed_tokens(src_tokens)
x = F.dropout(x, p=self.dropout_in, training=self.training)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# pack embedded source tokens into a PackedSequence
try:
packed_x = nn.utils.rnn.pack_padded_sequence(x, src_lengths.data.tolist())
except BaseException:
raise Exception(f"Packing failed in dataset {dataset_name}")
# apply LSTM
if self.bidirectional:
state_size = 2 * self.num_layers, bsz, self.hidden_size
else:
state_size = self.num_layers, bsz, self.hidden_size
h0 = x.data.new(*state_size).zero_()
c0 = x.data.new(*state_size).zero_()
packed_outs, (final_hiddens, final_cells) = self.lstm(packed_x, (h0, c0))
# unpack outputs and apply dropout
x, _ = nn.utils.rnn.pad_packed_sequence(
packed_outs, padding_value=self.padding_value
)
x = F.dropout(x, p=self.dropout_out, training=self.training)
assert list(x.size()) == [seqlen, bsz, self.output_units]
if self.bidirectional:
def combine_bidir(outs):
return torch.cat(
[
torch.cat([outs[2 * i], outs[2 * i + 1]], dim=0).view(
1, bsz, self.output_units
)
for i in range(self.num_layers)
],
dim=0,
)
final_hiddens = combine_bidir(final_hiddens)
final_cells = combine_bidir(final_cells)
encoder_padding_mask = src_tokens.eq(self.padding_idx).t()
# Set padded outputs to -inf so they are not selected by max-pooling
padding_mask = src_tokens.eq(self.padding_idx).t().unsqueeze(-1)
if padding_mask.any():
x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x)
# Build the sentence embedding by max-pooling over the encoder outputs
sentemb = x.max(dim=0)[0]
return {
"sentemb": sentemb,
"encoder_out": (x, final_hiddens, final_cells),
"encoder_padding_mask": encoder_padding_mask
if encoder_padding_mask.any()
else None,
}
def reorder_encoder_out(self, encoder_out_dict, new_order):
encoder_out_dict["sentemb"] = encoder_out_dict["sentemb"].index_select(
0, new_order
)
encoder_out_dict["encoder_out"] = tuple(
eo.index_select(1, new_order) for eo in encoder_out_dict["encoder_out"]
)
if encoder_out_dict["encoder_padding_mask"] is not None:
encoder_out_dict["encoder_padding_mask"] = encoder_out_dict[
"encoder_padding_mask"
].index_select(1, new_order)
return encoder_out_dict
def max_positions(self):
"""Maximum input length supported by the encoder."""
return int(1e5) # an arbitrary large number
class LSTMDecoder(FairseqIncrementalDecoder):
"""LSTM decoder."""
def __init__(
self,
dictionary,
embed_dim=512,
hidden_size=512,
out_embed_dim=512,
num_layers=1,
dropout_in=0.1,
dropout_out=0.1,
zero_init=False,
encoder_embed_dim=512,
encoder_output_units=512,
pretrained_embed=None,
num_langs=1,
lang_embed_dim=0,
):
super().__init__(dictionary)
self.dropout_in = dropout_in
self.dropout_out = dropout_out
self.hidden_size = hidden_size
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
if pretrained_embed is None:
self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
else:
self.embed_tokens = pretrained_embed
self.layers = nn.ModuleList(
[
LSTMCell(
input_size=encoder_output_units + embed_dim + lang_embed_dim
if layer == 0
else hidden_size,
hidden_size=hidden_size,
)
for layer in range(num_layers)
]
)
if hidden_size != out_embed_dim:
self.additional_fc = Linear(hidden_size, out_embed_dim)
self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
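        # Unless zero-initialized, the decoder's initial hidden/cell states for every
        # layer are predicted from the encoder's sentence embedding (see forward below).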
if zero_init:
self.sentemb2init = None
else:
self.sentemb2init = Linear(
encoder_output_units, 2 * num_layers * hidden_size
)
if lang_embed_dim == 0:
self.embed_lang = None
else:
self.embed_lang = nn.Embedding(num_langs, lang_embed_dim)
nn.init.uniform_(self.embed_lang.weight, -0.1, 0.1)
def forward(
self, prev_output_tokens, encoder_out_dict, incremental_state=None, lang_id=0
):
sentemb = encoder_out_dict["sentemb"]
encoder_out = encoder_out_dict["encoder_out"]
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
bsz, seqlen = prev_output_tokens.size()
# get outputs from encoder
encoder_outs, _, _ = encoder_out[:3]
srclen = encoder_outs.size(0)
# embed tokens
x = self.embed_tokens(prev_output_tokens)
x = F.dropout(x, p=self.dropout_in, training=self.training)
# embed language identifier
if self.embed_lang is not None:
lang_ids = prev_output_tokens.data.new_full((bsz,), lang_id)
langemb = self.embed_lang(lang_ids)
# TODO Should we dropout here???
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# initialize previous states (or get from cache during incremental generation)
cached_state = utils.get_incremental_state(
self, incremental_state, "cached_state"
)
if cached_state is not None:
prev_hiddens, prev_cells, input_feed = cached_state
else:
num_layers = len(self.layers)
if self.sentemb2init is None:
prev_hiddens = [
x.data.new(bsz, self.hidden_size).zero_() for i in range(num_layers)
]
prev_cells = [
x.data.new(bsz, self.hidden_size).zero_() for i in range(num_layers)
]
else:
init = self.sentemb2init(sentemb)
prev_hiddens = [
init[:, (2 * i) * self.hidden_size : (2 * i + 1) * self.hidden_size]
for i in range(num_layers)
]
prev_cells = [
init[
:,
(2 * i + 1) * self.hidden_size : (2 * i + 2) * self.hidden_size,
]
for i in range(num_layers)
]
input_feed = x.data.new(bsz, self.hidden_size).zero_()
attn_scores = x.data.new(srclen, seqlen, bsz).zero_()
outs = []
for j in range(seqlen):
if self.embed_lang is None:
input = torch.cat((x[j, :, :], sentemb), dim=1)
else:
input = torch.cat((x[j, :, :], sentemb, langemb), dim=1)
for i, rnn in enumerate(self.layers):
# recurrent cell
hidden, cell = rnn(input, (prev_hiddens[i], prev_cells[i]))
# hidden state becomes the input to the next layer
input = F.dropout(hidden, p=self.dropout_out, training=self.training)
# save state for next time step
prev_hiddens[i] = hidden
prev_cells[i] = cell
out = hidden
out = F.dropout(out, p=self.dropout_out, training=self.training)
# input feeding
input_feed = out
# save final output
outs.append(out)
# cache previous states (no-op except during incremental generation)
utils.set_incremental_state(
self,
incremental_state,
"cached_state",
(prev_hiddens, prev_cells, input_feed),
)
# collect outputs across time steps
x = torch.cat(outs, dim=0).view(seqlen, bsz, self.hidden_size)
# T x B x C -> B x T x C
x = x.transpose(1, 0)
# srclen x tgtlen x bsz -> bsz x tgtlen x srclen
attn_scores = attn_scores.transpose(0, 2)
# project back to size of vocabulary
if hasattr(self, "additional_fc"):
x = self.additional_fc(x)
x = F.dropout(x, p=self.dropout_out, training=self.training)
x = self.fc_out(x)
return x, attn_scores
def reorder_incremental_state(self, incremental_state, new_order):
super().reorder_incremental_state(incremental_state, new_order)
cached_state = utils.get_incremental_state(
self, incremental_state, "cached_state"
)
if cached_state is None:
return
def reorder_state(state):
if isinstance(state, list):
return [reorder_state(state_i) for state_i in state]
return state.index_select(0, new_order)
new_state = tuple(map(reorder_state, cached_state))
utils.set_incremental_state(self, incremental_state, "cached_state", new_state)
def max_positions(self):
"""Maximum output length supported by the decoder."""
return int(1e5) # an arbitrary large number
def Embedding(num_embeddings, embedding_dim, padding_idx):
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
nn.init.uniform_(m.weight, -0.1, 0.1)
nn.init.constant_(m.weight[padding_idx], 0)
return m
def LSTM(input_size, hidden_size, **kwargs):
m = nn.LSTM(input_size, hidden_size, **kwargs)
for name, param in m.named_parameters():
if "weight" in name or "bias" in name:
param.data.uniform_(-0.1, 0.1)
return m
def LSTMCell(input_size, hidden_size, **kwargs):
m = nn.LSTMCell(input_size, hidden_size, **kwargs)
for name, param in m.named_parameters():
if "weight" in name or "bias" in name:
param.data.uniform_(-0.1, 0.1)
return m
def Linear(in_features, out_features, bias=True, dropout=0):
"""Weight-normalized Linear layer (input: N x T x C)"""
m = nn.Linear(in_features, out_features, bias=bias)
m.weight.data.uniform_(-0.1, 0.1)
if bias:
m.bias.data.uniform_(-0.1, 0.1)
return m
@register_model_architecture("laser_lstm", "laser_lstm")
def base_architecture(args):
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
args.encoder_hidden_size = getattr(
args, "encoder_hidden_size", args.encoder_embed_dim
)
args.encoder_layers = getattr(args, "encoder_layers", 1)
args.encoder_bidirectional = getattr(args, "encoder_bidirectional", False)
args.encoder_dropout_in = getattr(args, "encoder_dropout_in", args.dropout)
args.encoder_dropout_out = getattr(args, "encoder_dropout_out", args.dropout)
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
args.decoder_hidden_size = getattr(
args, "decoder_hidden_size", args.decoder_embed_dim
)
args.decoder_layers = getattr(args, "decoder_layers", 1)
args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 512)
args.decoder_dropout_in = getattr(args, "decoder_dropout_in", args.dropout)
args.decoder_dropout_out = getattr(args, "decoder_dropout_out", args.dropout)
args.decoder_zero_init = getattr(args, "decoder_zero_init", "0")
args.decoder_lang_embed_dim = getattr(args, "decoder_lang_embed_dim", 0)
args.fixed_embeddings = getattr(args, "fixed_embeddings", False)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from collections import OrderedDict, defaultdict
import json
import os
import logging
from argparse import ArgumentError
from fairseq import options, models
from fairseq.data import (
data_utils,
Dictionary,
LanguagePairDataset,
IndexedDataset,
FairseqDataset,
)
from .multitask_data_utils import (
MultitaskDatasetWrapper,
MultidatasetEpochBatchIterator,
)
from fairseq.tasks import LegacyFairseqTask, register_task
logger = logging.getLogger(__name__)
@register_task("laser")
class LaserTask(LegacyFairseqTask):
@staticmethod
def add_args(parser):
"""Add task-specific arguments to the parser."""
parser.add_argument(
"configfile", metavar="PATH", help="dataset configuration file in json"
)
parser.add_argument(
"--weighting-alpha",
type=float,
default=None,
help="alpha for automatic weighting",
)
parser.add_argument(
"--raw-text", action="store_true", help="load raw text dataset"
)
parser.add_argument(
"--left-pad-source",
default="True",
type=str,
metavar="BOOL",
help="pad the source on the left (default: True)",
)
parser.add_argument(
"--left-pad-target",
default="False",
type=str,
metavar="BOOL",
help="pad the target on the left (default: False)",
)
try:
parser.add_argument(
"--max-source-positions",
default=1024,
type=int,
metavar="N",
help="max number of tokens in the source sequence",
)
parser.add_argument(
"--max-target-positions",
default=1024,
type=int,
metavar="N",
help="max number of tokens in the target sequence",
)
except ArgumentError:
# this might have already been defined. Once we transition this to hydra it should be fine to add it here.
pass
def __init__(self, args, config, src_dictionary, tgt_dictionary, num_tasks):
super().__init__(args)
self.config = config
self.src_dictionary = src_dictionary
self.tgt_dictionary = tgt_dictionary
self.num_tasks = num_tasks
@classmethod
def setup_task(cls, args, **kwargs):
with open(args.configfile, "r") as f:
config = json.load(f)
num_tasks = max(dataset["id"] for dataset in config["train"]) + 1
args.left_pad_source = options.eval_bool(args.left_pad_source)
args.left_pad_target = options.eval_bool(args.left_pad_target)
src_dictionary = Dictionary.load(config["src_vocab"])
tgt_dictionary = Dictionary.load(config["tgt_vocab"])
logger.info(
"| src Dictionary {} : {} types".format(
config["src_vocab"], len(src_dictionary)
)
)
logger.info(
"| tgt Dictionary {} : {} types".format(
config["tgt_vocab"], len(tgt_dictionary)
)
)
return cls(args, config, src_dictionary, tgt_dictionary, num_tasks)
# Experimental overriding for backtranslation
def build_model(self, args, from_checkpoint=False):
model = models.build_model(args, self)
return model
def dataset(self, split):
if split not in self.datasets:
raise KeyError("Dataset not loaded: " + split)
return self.datasets[split]
def load_dataset(self, split, epoch=1, **kwargs):
"""Load a dataset split."""
def indexed_dataset(path, dictionary):
if self.args.raw_text:
raise Exception("Unable to handle raw text.")
dataset = IndexedDataset(path, fix_lua_indexing=True)
return dataset
pair_datasets = OrderedDict()
if split == "valid":
self.datasets[split] = pair_datasets
return
if split not in self.config:
raise FileNotFoundError(
"Dataset not found in config file: {}".format(split)
)
size_by_corpus = defaultdict(int)
size_sum = 0
size_sum_with_subsampling = 0
init_pair_datasets = {}
for dataset_config in self.config[split]:
src_path = os.path.dirname(dataset_config["src"])
corpus_name = src_path.split("/")[-2]
language_pair_name = src_path.split("/")[-1]
pair_datasets_key = corpus_name + "-" + language_pair_name
logger.info(f"loading... {pair_datasets_key}")
if "src" in dataset_config:
src_dataset = indexed_dataset(
dataset_config["src"], self.src_dictionary
)
else:
src_dataset = None
if "tgt" in dataset_config:
tgt_dataset = indexed_dataset(
dataset_config["tgt"], self.tgt_dictionary
)
else:
tgt_dataset = None
dataset = LanguagePairDataset(
src_dataset,
src_dataset.sizes,
self.src_dictionary,
tgt_dataset,
tgt_dataset.sizes,
self.tgt_dictionary,
left_pad_source=self.args.left_pad_source,
left_pad_target=self.args.left_pad_target,
)
if pair_datasets_key in init_pair_datasets:
logger.warning(
f"Ignoring already added {pair_datasets_key}. "
f"Consider using `sample` key in order to upsample."
)
else:
init_pair_datasets[pair_datasets_key] = {
"dataset": dataset,
"sample": dataset_config.get("sample", None),
"id": dataset_config.get("id", None),
"len": len(dataset),
}
length_sum = 0
weighted_freqs_sum = 0
freq_per_dataset = {}
vmax = 0
vmin = 1
weighted_freq_per_dataset = {}
if self.args.weighting_alpha:
for key in init_pair_datasets:
if init_pair_datasets[key]["sample"] is None:
length_sum += len(init_pair_datasets[key]["dataset"])
for key in init_pair_datasets:
if init_pair_datasets[key]["sample"] is None:
val = float(init_pair_datasets[key]["len"]) / length_sum
freq_per_dataset[key] = val
weighted_freqs_sum += val ** self.args.weighting_alpha
for key in freq_per_dataset:
val = (
freq_per_dataset[key] ** self.args.weighting_alpha
/ weighted_freqs_sum
)
vmin = min(vmin, val)
vmax = max(vmax, val)
weighted_freq_per_dataset[key] = val
for pair_datasets_key in init_pair_datasets:
dataset_config = init_pair_datasets[pair_datasets_key]
dataset = dataset_config["dataset"]
sample = dataset_config["sample"]
if sample is None:
sample = 1.0
if pair_datasets_key in weighted_freq_per_dataset:
w = vmax / weighted_freq_per_dataset[pair_datasets_key]
sample = w
sample = round(sample)
initial_sample = sample
initial_pair_datasets_key = pair_datasets_key
while sample >= 1.0:
assert (
pair_datasets_key not in pair_datasets
), f"{pair_datasets_key} already in"
size_sum_with_subsampling += len(dataset)
pair_datasets[pair_datasets_key] = MultitaskDatasetWrapper(
dataset, dataset_config.get("id", 0), 1.0, name=pair_datasets_key
)
size_sum += len(dataset)
sample -= 1.0
pair_datasets_key += "-up"
assert sample < 1e-6, f"sample remains > 0 {pair_datasets_key}"
logger.info(
f"added pair {initial_pair_datasets_key} length {len(dataset)} new_length = {len(dataset)*initial_sample}"
)
size_by_corpus[corpus_name] += len(dataset)
self.datasets[split] = pair_datasets
logger.info(
f"Datasets number = {len(self.datasets[split])} size = {size_sum} size_sum_with_subsampling = {size_sum_with_subsampling}"
)
@property
def source_dictionary(self):
return self.src_dictionary
@property
def target_dictionary(self):
return self.tgt_dictionary
def get_batch_iterator(
self,
dataset,
max_tokens=None,
max_sentences=None,
max_positions=None,
ignore_invalid_inputs=False,
required_batch_size_multiple=1,
seed=1,
num_shards=1,
shard_id=0,
num_workers=0,
epoch=1,
data_buffer_size=0,
disable_iterator_cache=False,
grouped_shuffling=False,
update_epoch_batch_itr=False,
**kwargs,
):
assert isinstance(dataset, OrderedDict)
assert len(dataset)
assert isinstance(dataset[next(iter(dataset))], FairseqDataset)
# initialize the dataset with the correct starting epoch
for _, dt in dataset.items():
dt.set_epoch(epoch)
indices = OrderedDict()
batch_sampler = OrderedDict()
with data_utils.numpy_seed(seed + epoch):
for key, dt in dataset.items():
logger.info(f"\t ordered_indices {key}")
indices[key] = dt.ordered_indices()
# filter examples that are too large
if max_positions is not None:
for key, dt in dataset.items():
logger.info(f"\t filter_by_size {key}")
indices[key], ignored = dt.filter_indices_by_size(
indices[key], max_positions
)
for key, dt in dataset.items():
logger.info(f"\t batch_by_size {key}")
batch_sampler[key] = data_utils.batch_by_size(
indices[key],
dt.num_tokens,
max_tokens=max_tokens,
max_sentences=max_sentences,
required_batch_size_multiple=required_batch_size_multiple,
)
epoch_iter = MultidatasetEpochBatchIterator(
dataset=dataset,
batch_sampler=batch_sampler,
seed=seed,
num_shards=num_shards,
shard_id=shard_id,
num_workers=num_workers,
epoch=epoch,
)
return epoch_iter
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
from typing import Any, Dict, List, Optional
from torch import Tensor
import torch
import torch.nn as nn
from fairseq.models import (
FairseqEncoderDecoderModel,
register_model,
register_model_architecture,
)
from fairseq.models.transformer import (
base_architecture,
Embedding,
TransformerModel,
TransformerEncoder,
TransformerDecoder,
)
from fairseq.modules import (
TransformerDecoderLayer,
)
logger = logging.getLogger(__name__)
@register_model("laser_transformer")
class LaserTransformerModel(FairseqEncoderDecoderModel):
"""Train Transformer for LASER task
Requires --task laser
"""
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
def forward(
self,
src_tokens,
src_lengths,
prev_output_tokens=None,
tgt_tokens=None,
tgt_lengths=None,
target_language_id=-1,
dataset_name="",
):
laser_encoder_out = self.encoder(src_tokens, src_lengths)
return self.decoder(
prev_output_tokens, laser_encoder_out, lang_id=target_language_id
)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
TransformerModel.add_args(parser)
parser.add_argument(
"--decoder-lang-embed-dim",
type=int,
metavar="N",
help="decoder language embedding dimension",
)
@classmethod
def build_model(cls, args, task):
base_laser_transformer_architecture(args)
num_langs = task.num_tasks if hasattr(task, "num_tasks") else 0
def load_embed_tokens(dictionary, embed_dim):
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
return Embedding(num_embeddings, embed_dim, padding_idx)
encoder_embed_tokens = load_embed_tokens(
task.source_dictionary, args.encoder_embed_dim
)
decoder_embed_tokens = load_embed_tokens(
task.target_dictionary, args.decoder_embed_dim
)
num_langs = task.num_tasks if hasattr(task, "num_tasks") else 0
encoder = LaserTransformerEncoder(
args, task.source_dictionary, encoder_embed_tokens
)
decoder = LaserTransformerDecoder(
args,
task.target_dictionary,
decoder_embed_tokens,
num_langs=num_langs,
lang_embed_dim=args.decoder_lang_embed_dim,
)
return cls(encoder, decoder)
class LaserTransformerEncoder(TransformerEncoder):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, src_tokens, *args, **kwargs):
encoder_out = super().forward(src_tokens, *args, **kwargs)
x = encoder_out["encoder_out"][0] # T x B x C
padding_mask = src_tokens.eq(self.padding_idx).t().unsqueeze(-1)
if padding_mask.any():
x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x)
# Build the sentence embedding by max-pooling over the encoder outputs
sentemb = x.max(dim=0)[0]
# The PyTorch Mobile lite interpreter does not support returning NamedTuple in
# `forward`, so we use a dictionary instead.
# TorchScript does not support mixed values so the values are all lists.
# The empty list is equivalent to None.
return {"sentemb": [sentemb]} # B x C
@torch.jit.export
def reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order):
"""
Same as the one in transformer.py, with new_sentemb
"""
if len(encoder_out["sentemb"]) == 0:
new_sentemb = []
else:
new_sentemb = [encoder_out["sentemb"][0].index_select(0, new_order)]
return {
"sentemb": new_sentemb, # B x C
}
class LaserTransformerDecoder(TransformerDecoder):
def __init__(self, args, dictionary, *kargs, **kwargs):
self.num_langs = kwargs.get("num_langs", 1)
self.lang_embed_dim = kwargs.get("lang_embed_dim", 0)
kwargs.pop("num_langs", None)
kwargs.pop("lang_embed_dim", None)
super().__init__(args, dictionary, *kargs, **kwargs, no_encoder_attn=True)
if self.lang_embed_dim == 0:
self.embed_lang = None
else:
self.embed_lang = nn.Embedding(self.num_langs, self.lang_embed_dim)
nn.init.uniform_(self.embed_lang.weight, -0.1, 0.1)
if self.output_projection is not None:
laser_output_embed_dim = (
self.output_embed_dim + self.lang_embed_dim + args.encoder_embed_dim
)
self.output_projection = nn.Linear(
laser_output_embed_dim, len(dictionary), bias=False
)
nn.init.normal_(
self.output_projection.weight,
mean=0,
std=laser_output_embed_dim ** -0.5,
)
def build_decoder_layer(self, args, no_encoder_attn=False):
decoder_embed_dim = args.decoder_embed_dim
args.decoder_embed_dim = (
decoder_embed_dim + self.lang_embed_dim + args.encoder_embed_dim
)
res = TransformerDecoderLayer(args, no_encoder_attn=True)
args.decoder_embed_dim = decoder_embed_dim
return res
def extract_features(
self,
prev_output_tokens,
encoder_out: Optional[Dict[str, List[Tensor]]],
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
full_context_alignment: bool = False,
alignment_layer: Optional[int] = None,
alignment_heads: Optional[int] = None,
lang_id: Optional[int] = None,
):
"""
Similar to *forward* but only return features.
Includes several features from "Jointly Learning to Align and
Translate with Transformer Models" (Garg et al., EMNLP 2019).
Args:
full_context_alignment (bool, optional): don't apply
auto-regressive mask to self-attention (default: False).
alignment_layer (int, optional): return mean alignment over
heads at this layer (default: last layer).
alignment_heads (int, optional): only average alignment over
this many heads (default: all heads).
Returns:
tuple:
- the decoder's features of shape `(batch, tgt_len, embed_dim)`
- a dictionary with any model-specific outputs
"""
if alignment_layer is None:
alignment_layer = self.num_layers - 1
# embed positions
positions = (
self.embed_positions(
prev_output_tokens, incremental_state=incremental_state
)
if self.embed_positions is not None
else None
)
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
if positions is not None:
positions = positions[:, -1:]
bsz, seqlen = prev_output_tokens.size()
# embed tokens and positions
x = self.embed_scale * self.embed_tokens(prev_output_tokens)
if self.quant_noise is not None:
x = self.quant_noise(x)
if self.project_in_dim is not None:
x = self.project_in_dim(x)
if positions is not None:
x += positions
if self.layernorm_embedding is not None:
x = self.layernorm_embedding(x)
x = self.dropout_module(x)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
if self.embed_lang is not None:
lang_ids = prev_output_tokens.data.new_full((bsz,), lang_id)
langemb = self.embed_lang(lang_ids)
langemb = langemb.unsqueeze(0)
repeat_vals = [x.shape[0] // langemb.shape[0]] + [-1] * (
len(langemb.shape) - 1
)
x = torch.cat((x, langemb.expand(*repeat_vals)), dim=-1)
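        # The decoder has no encoder attention; instead, the encoder's max-pooled
        # sentence embedding is appended to every target position below.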
sentemb = encoder_out["sentemb"][0]
sentemb = sentemb.unsqueeze(0)
repeat_vals = [x.shape[0] // sentemb.shape[0]] + [-1] * (len(sentemb.shape) - 1)
x = torch.cat((x, sentemb.expand(*repeat_vals)), dim=-1)
self_attn_padding_mask: Optional[Tensor] = None
if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any():
self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)
# decoder layers
attn: Optional[Tensor] = None
inner_states: List[Optional[Tensor]] = [x]
for idx, layer in enumerate(self.layers):
if incremental_state is None and not full_context_alignment:
self_attn_mask = self.buffered_future_mask(x)
else:
self_attn_mask = None
x, layer_attn, _ = layer(
x,
None,
None,
incremental_state,
self_attn_mask=self_attn_mask,
self_attn_padding_mask=self_attn_padding_mask,
need_attn=bool((idx == alignment_layer)),
need_head_weights=bool((idx == alignment_layer)),
)
inner_states.append(x)
if layer_attn is not None and idx == alignment_layer:
attn = layer_attn.float().to(x)
if attn is not None:
if alignment_heads is not None:
attn = attn[:alignment_heads]
# average probabilities over heads
attn = attn.mean(dim=0)
if self.layer_norm is not None:
x = self.layer_norm(x)
# T x B x C -> B x T x C
x = x.transpose(0, 1)
if self.project_out_dim is not None:
x = self.project_out_dim(x)
return x, {"attn": [attn], "inner_states": inner_states}
def forward(
self,
prev_output_tokens,
encoder_out: Optional[Dict[str, List[Tensor]]] = None,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
features_only: bool = False,
alignment_layer: Optional[int] = None,
alignment_heads: Optional[int] = None,
src_lengths: Optional[Any] = None,
return_all_hiddens: bool = False,
lang_id: Optional[int] = None,
):
"""
Args:
prev_output_tokens (LongTensor): previous decoder outputs of shape
`(batch, tgt_len)`, for teacher forcing
encoder_out (optional): output from the encoder, used for
encoder-side attention
incremental_state (dict): dictionary used for storing state during
:ref:`Incremental decoding`
features_only (bool, optional): only return features without
applying output layer (default: False).
Returns:
tuple:
- the decoder's output of shape `(batch, tgt_len, vocab)`
- a dictionary with any model-specific outputs
"""
assert lang_id is not None
x, extra = self.extract_features(
prev_output_tokens,
encoder_out=encoder_out,
incremental_state=incremental_state,
alignment_layer=alignment_layer,
alignment_heads=alignment_heads,
lang_id=lang_id,
)
if not features_only:
x = self.output_layer(x)
return x, extra
@register_model_architecture("laser_transformer", "laser_transformer")
def base_laser_transformer_architecture(args):
base_architecture(args)
args.decoder_lang_embed_dim = getattr(args, "decoder_lang_embed_dim", 0)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from collections import OrderedDict
import numpy as np
from fairseq.data import BaseWrapperDataset, FairseqDataset, iterators
class MultiItr(object):
def __init__(self, itr):
self.itr = itr
self._counts = [0 for x in itr]
def __len__(self):
return sum(len(itr) for itr in self.itr)
def __iter__(self):
return self
def __next__(self):
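        # Draw from the iterator that is furthest behind (lowest consumed fraction)
        # so all underlying datasets advance at a similar rate.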
ratios = [count / len(itr) for count, itr in zip(self._counts, self.itr)]
idx = ratios.index(min(ratios))
self._counts[idx] += 1
return next(self.itr[idx])
class MultidatasetEpochBatchIterator(iterators.EpochBatchIterating):
"""A wrapper around multiple epoch batch iterators."""
def __init__(
self,
dataset,
batch_sampler,
seed=1,
num_shards=1,
shard_id=0,
num_workers=0,
epoch=1,
):
assert isinstance(dataset, OrderedDict)
assert len(dataset)
assert isinstance(dataset[next(iter(dataset))], FairseqDataset)
self.iterators = []
self.epoch = epoch
for key, dt in dataset.items():
epoch_iter = iterators.EpochBatchIterator(
dataset=dt,
collate_fn=dt.collater,
batch_sampler=batch_sampler[key],
seed=seed,
num_shards=num_shards,
shard_id=shard_id,
num_workers=0,
epoch=epoch,
)
self.iterators.append(epoch_iter)
def __len__(self):
return sum(len(itr) for itr in self.iterators)
def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False):
# `self.epoch += 1` should be handled by underlying `EpochBatchIterator`s.
return MultiItr(
[
itr.next_epoch_itr(
shuffle=shuffle, fix_batches_to_gpus=fix_batches_to_gpus
)
for itr in self.iterators
]
)
def end_of_epoch(self):
return all(itr.end_of_epoch() for itr in self.iterators)
@property
def next_epoch_idx(self):
"""Return the epoch index after *next_epoch_itr* is called."""
epochs = [itr.next_epoch_idx for itr in self.iterators]
self.epoch = epochs[0]
assert all(epoch == self.epoch for epoch in epochs)
return self.epoch
@property
def iterations_in_epoch(self):
return sum(itr.iterations_in_epoch for itr in self.iterators)
def state_dict(self):
return {
"iterators": [it.state_dict() for it in self.iterators],
"epoch": self.epoch,
}
def load_state_dict(self, state_dict):
self.epoch = state_dict["epoch"]
for it, d in zip(self.iterators, state_dict["iterators"]):
it.load_state_dict(d)
class MultitaskDatasetWrapper(BaseWrapperDataset):
"""A wrapper for a multitask dataset."""
def __init__(self, dataset, target_language_id, sample=1.0, name=""):
super().__init__(dataset)
self.target_language_id = target_language_id
self.sample = sample
self.name = name
def collater(self, *args, **kwargs):
ans = self.dataset.collater(*args, **kwargs)
if "net_input" in ans:
ans["net_input"]["target_language_id"] = self.target_language_id
ans["net_input"]["dataset_name"] = self.name
return ans
def num_tokens(self, *args, **kwargs):
return self.dataset.num_tokens(*args, **kwargs)
def ordered_indices(self, *args, **kwargs):
indices = self.dataset.ordered_indices(*args, **kwargs)
# Hacky solution for sampling
size = int(self.sample * indices.shape[0])
return indices.take(np.sort(np.random.permutation(indices.shape[0])[:size]))
def size(self, index: int):
return self.dataset.size(index)
@property
def supports_prefetch(self):
"""Whether this dataset supports prefetching."""
return getattr(self.dataset, "supports_prefetch", False)
def prefetch(self, indices):
return self.dataset.prefetch(indices)