KEEP A GOING AN IF YOU'RE LUCKY YOU'LL RUN PLUMB INTO THEM WAS THE JEERING ANSWER AS THE SLEEPY COWMEN SPURRED THEIR PONIES ON TOWARD CAMP MUTTERING THEIR DISAPPROVAL OF TAKING ALONG A BUNCH OF BOYS ON A CATTLE DRIVE (None-0)
KEEP A GOIN AND IF YOU'RE LUCKY YOU'LL RUN PLUMB INTO THEM WAS THE JEERING ANSWER AS THE SLEEPY COWMEN SPURRED THEIR PONIES ON TOWARD CAMP MUTTERING THEIR DISAPPROVAL OF TAKING ALONG A BUNCH OF BOYS ON A CATTLE DRIVE (None-0)
#!/bin/bash
set -e
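# Regression test for HuBERT feature and unit dumping: download the pretrained
# checkpoints, dump features / k-means labels for the sample utterance, and diff
# the outputs against the reference files shipped in examples/hubert/tests.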
sizes="base large xlarge"
declare -A ckpt_urls
ckpt_urls[base]="https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt"
ckpt_urls[large]="https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k.pt"
ckpt_urls[xlarge]="https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k.pt"
declare -A km_layers
km_layers[base]=9
km_layers[large]=20
km_layers[xlarge]=30
declare -A km_urls
km_urls[base]="https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960_L9_km500.bin"
declare -A km_nunits
km_nunits[base]=500
test_dir=./examples/hubert/tests
split=sample
echo -e "${test_dir}\n6313-76958-0021.flac\t190800" > "${test_dir}/${split}.tsv"
check_feature () {
echo "checking features..."
size=$1
ckpt_url=$2
km_layer=$3
ckpt_path="$test_dir/$(basename "$ckpt_url")"
if [ ! -f "$ckpt_path" ]; then
echo "downloading $ckpt_url to $ckpt_path"
wget "$ckpt_url" -O "$ckpt_path"
fi
python ./examples/hubert/simple_kmeans/dump_hubert_feature.py \
"${test_dir}" "${split}" "${ckpt_path}" "${km_layer}" 1 0 "${test_dir}"
if diff -q "${test_dir}/${split}.${size}.L${km_layer}.npy" "${test_dir}/${split}_0_1.npy" &>/dev/null; then
echo "...passed npy check"
else
echo "...failed npy check"
fi
if diff -q "${test_dir}/${split}.${size}.L${km_layer}.len" "${test_dir}/${split}_0_1.len" &>/dev/null; then
echo "...passed len check"
else
echo "...failed len check"
fi
}
check_unit () {
echo "checking units..."
size=$1
km_url=$2
km_layer=$3
km_nunit=$4
km_path="$test_dir/$(basename "$km_url")"
if [ ! -f "$km_path" ]; then
echo "downloading $km_url to $km_path"
wget "$km_url" -O "$km_path"
fi
python ./examples/hubert/simple_kmeans/dump_km_label.py \
"${test_dir}" "${split}" "${km_path}" 1 0 "${test_dir}"
if diff -q "${test_dir}/${split}.${size}.L${km_layer}.km${km_nunit}.km" "${test_dir}/${split}_0_1.km" &>/dev/null; then
echo "...passed unit check"
else
echo "...failed unit check"
fi
}
for size in $sizes; do
echo "=== Running unit test for HuBERT $size ==="
check_feature "$size" "${ckpt_urls[$size]}" "${km_layers[$size]}"
if [ -n "${km_urls[$size]}" ]; then
check_unit "$size" "${km_urls[$size]}" "${km_layers[$size]}" "${km_nunits[$size]}"
fi
rm -f $test_dir/${split}_0_1.*
done
#!/bin/bash
set -e
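# Regression test for fine-tuned HuBERT ASR: decode the sample utterance with the
# fine-tuned checkpoints and diff the hypotheses against the reference hypo.word files.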
sizes="large xlarge"
declare -A ckpt_urls
ckpt_urls[large]="https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k_finetune_ls960.pt"
ckpt_urls[xlarge]="https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k_finetune_ls960.pt"
test_dir=$(pwd)/examples/hubert/tests
split=sample
echo -e "${test_dir}\n6313-76958-0021.flac\t190800" > "${test_dir}/${split}.tsv"
echo -e "K E E P | A | G O I N G | A N D | I F | Y O U ' R E | L U C K Y | Y O U ' L L | R U N | P L U M B | I N T O | T H E M | W A S | T H E | J E E R I N G | A N S W E R | A S | T H E | S L E E P Y | C O W M E N | S P U R R E D | T H E I R | P O N I E S | O N | T O W A R D | C A M P | M U T T E R I N G | T H E I R | D I S A P P R O V A L | O F | T A K I N G | A L O N G | A | B U N C H | O F | B O Y S | O N | A | C A T T L E | D R I V E |" > "${test_dir}/${split}.ltr"
check_asr () {
echo "checking asr outputs..."
size=$1
ckpt_url=$2
ckpt_path="$test_dir/$(basename "$ckpt_url")"
if [ ! -f "$ckpt_path" ]; then
echo "downloading $ckpt_url to $ckpt_path"
wget "$ckpt_url" -O "$ckpt_path"
fi
python examples/speech_recognition/new/infer.py \
--config-dir examples/hubert/config/decode --config-name infer_viterbi \
common_eval.path="${ckpt_path}" task.data="${test_dir}" task.normalize=true \
decoding.results_path="${test_dir}/pred" \
common_eval.results_path="${test_dir}/pred" \
common_eval.quiet=false dataset.gen_subset="${split}"
if diff -q "${test_dir}/pred/hypo.word" "${test_dir}/${split}.${size}.hypo.word" &>/dev/null; then
echo "...passed word check"
else
echo "...failed word check"
fi
rm -rf "${test_dir}/pred"
}
for size in $sizes; do
check_asr "$size" "${ckpt_urls[$size]}"
done
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
src_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2.pt"
ref_ckpt = "/checkpoint/wnhsu/w2v/hubert_icassp_oss_v3/iter2_km100-400k-grp-L6/oss.km500_p0_1_s334.pmw1_0.puw0_0.grpnorm.ml10.mp0_8.untie.mxsz250000.ufreq1.maxtok1400000.MU100k.s1337.ngpu32/checkpoint_last.pt"
new_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2_updated.pt"
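# update_state renames the old label embedding key and fills in the task/labels
# fields so this legacy checkpoint loads with the current hubert_pretraining task.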
def update_state(state):
state["model"]["label_embs_concat"] = state["model"].pop("label_embs")
state["args"].task = "hubert_pretraining"
state["args"].labels = f"['{state['args'].labels}']"
return state
src_state = torch.load(src_ckpt)
src_state = update_state(src_state)
torch.save(src_state, new_ckpt)
# Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)
This page includes instructions for training models described in [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](https://arxiv.org/abs/1909.02074).
## Training a joint alignment-translation model on WMT'18 En-De
##### 1. Extract and preprocess the WMT'18 En-De data
```bash
./prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh
```
##### 2. Generate alignments with a statistical alignment toolkit, e.g. Giza++ or FastAlign.
In this example, we use FastAlign.
```bash
git clone git@github.com:clab/fast_align.git
pushd fast_align
mkdir build
cd build
cmake ..
make
popd
ALIGN=fast_align/build/fast_align
paste bpe.32k/train.en bpe.32k/train.de | awk -F '\t' '{print $1 " ||| " $2}' > bpe.32k/train.en-de
$ALIGN -i bpe.32k/train.en-de -d -o -v > bpe.32k/train.align
```
##### 3. Preprocess the dataset with the above generated alignments.
```bash
fairseq-preprocess \
--source-lang en --target-lang de \
--trainpref bpe.32k/train \
--validpref bpe.32k/valid \
--testpref bpe.32k/test \
--align-suffix align \
--destdir binarized/ \
--joined-dictionary \
--workers 32
```
##### 4. Train a model
```bash
fairseq-train \
binarized \
--arch transformer_wmt_en_de_big_align --share-all-embeddings \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --activation-fn relu \
--lr 0.0002 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
--dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
--max-tokens 3500 --label-smoothing 0.1 \
--save-dir ./checkpoints --log-interval 1000 --max-update 60000 \
--keep-interval-updates -1 --save-interval-updates 0 \
--load-alignments --criterion label_smoothed_cross_entropy_with_alignment \
--fp16
```
Note that the `--fp16` flag requires that you have CUDA 9.1 or greater and a Volta GPU or newer.
If you want to train the above model with big batches (assuming your machine has 8 GPUs; a full command sketch follows this list):
- add `--update-freq 8` to simulate training on 8x8=64 GPUs
- increase the learning rate; 0.0007 works well for big batches
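A full command for this big-batch setting might look like the following sketch (identical to the command above except for `--lr 0.0007` and the added `--update-freq 8`):
```bash
fairseq-train \
    binarized \
    --arch transformer_wmt_en_de_big_align --share-all-embeddings \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --activation-fn relu \
    --lr 0.0007 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
    --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
    --max-tokens 3500 --label-smoothing 0.1 \
    --save-dir ./checkpoints --log-interval 1000 --max-update 60000 \
    --keep-interval-updates -1 --save-interval-updates 0 \
    --load-alignments --criterion label_smoothed_cross_entropy_with_alignment \
    --update-freq 8 --fp16
```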
##### 5. Evaluate and generate the alignments (BPE level)
```bash
fairseq-generate \
binarized --gen-subset test --print-alignment \
--source-lang en --target-lang de \
--path checkpoints/checkpoint_best.pt --beam 5 --nbest 1
```
##### 6. Other resources.
The code for:
1. preparing alignment test sets
2. converting BPE level alignments to token level alignments
3. symmetrizing bidirectional alignments
4. evaluating alignments using AER metric
can be found [here](https://github.com/lilt/alignment-scripts).
## Citation
```bibtex
@inproceedings{garg2019jointly,
title = {Jointly Learning to Align and Translate with Transformer Models},
author = {Garg, Sarthak and Peitz, Stephan and Nallasamy, Udhyakumar and Paulik, Matthias},
booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)},
address = {Hong Kong},
month = {November},
url = {https://arxiv.org/abs/1909.02074},
year = {2019},
}
```
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git
SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
URLS=(
"http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
"http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
"http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz"
"http://data.statmt.org/wmt18/translation-task/rapid2016.tgz"
"http://data.statmt.org/wmt17/translation-task/dev.tgz"
"http://statmt.org/wmt14/test-full.tgz"
)
CORPORA=(
"training/europarl-v7.de-en"
"commoncrawl.de-en"
"training-parallel-nc-v13/news-commentary-v13.de-en"
"rapid2016.de-en"
)
if [ ! -d "$SCRIPTS" ]; then
echo "Please set SCRIPTS variable correctly to point to Moses scripts."
exit 1
fi
src=en
tgt=de
lang=en-de
prep=wmt18_en_de
tmp=$prep/tmp
orig=orig
dev=dev/newstest2012
codes=32000
bpe=bpe.32k
mkdir -p $orig $tmp $prep $bpe
cd $orig
for ((i=0;i<${#URLS[@]};++i)); do
url=${URLS[i]}
file=$(basename $url)
if [ -f $file ]; then
echo "$file already exists, skipping download"
else
wget "$url"
if [ -f $file ]; then
echo "$url successfully downloaded."
else
echo "$url not successfully downloaded."
exit 1
fi
if [ ${file: -4} == ".tgz" ]; then
tar zxvf $file
elif [ ${file: -4} == ".tar" ]; then
tar xvf $file
fi
fi
done
cd ..
echo "pre-processing train data..."
for l in $src $tgt; do
rm -rf $tmp/train.tags.$lang.tok.$l
for f in "${CORPORA[@]}"; do
cat $orig/$f.$l | \
perl $REM_NON_PRINT_CHAR | \
perl $TOKENIZER -threads 8 -l $l -no-escape >> $tmp/train.tags.$lang.tok.$l
done
done
echo "pre-processing test data..."
for l in $src $tgt; do
if [ "$l" == "$src" ]; then
t="src"
else
t="ref"
fi
grep '<seg id' $orig/test-full/newstest2014-deen-$t.$l.sgm | \
sed -e 's/<seg id="[0-9]*">\s*//g' | \
sed -e 's/\s*<\/seg>\s*//g' | \
sed -e "s/\’/\'/g" | \
perl $TOKENIZER -threads 8 -l $l -no-escape > $tmp/test.$l
echo ""
done
# apply length filtering before BPE
perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train 1 100
# use newstest2012 for valid
echo "pre-processing valid data..."
for l in $src $tgt; do
rm -rf $tmp/valid.$l
cat $orig/$dev.$l | \
perl $REM_NON_PRINT_CHAR | \
perl $TOKENIZER -threads 8 -l $l -no-escape >> $tmp/valid.$l
done
mkdir output
mv $tmp/{train,valid,test}.{$src,$tgt} output
#BPE
git clone https://github.com/glample/fastBPE.git
pushd fastBPE
g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
popd
fastBPE/fast learnbpe $codes output/train.$src output/train.$tgt > $bpe/codes
for split in {train,valid,test}; do for lang in {en,de}; do fastBPE/fast applybpe $bpe/$split.$lang output/$split.$lang $bpe/codes; done; done
# Adaptive Input Representations for Neural Language Modeling (Baevski and Auli, 2018)
## Pre-trained models
Description | Parameters | Dataset | Model and Test set(s)
---|---:|---|---
Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 1026M | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2)
Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 247M | [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2)
## Training an LM with adaptive inputs
First, see the general [language modeling README](README.md) for instructions on
preprocessing the WikiText-103 data.
Then use the following training command to train a model with adaptive inputs
using the `transformer_lm_wiki103` model architecture:
```bash
fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/transformer_wikitext-103 \
--arch transformer_lm_wiki103 \
--max-update 286000 --lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \
--warmup-updates 16000 --warmup-init-lr 1e-07 --stop-min-lr 1e-09 --optimizer nag --min-lr 0.0001 --clip-norm 0.1 \
--criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \
--sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=legacy_ddp
```
## Citation
```bibtex
@inproceedings{
baevski2018adaptive,
title={Adaptive Input Representations for Neural Language Modeling},
author={Alexei Baevski and Michael Auli},
booktitle={International Conference on Learning Representations},
year={2019},
url={https://openreview.net/forum?id=ByxZX20qFQ},
}
```
# Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)
## Example usage
First download and preprocess the data following the main [language modeling README](README.md).
Then to train a convolutional LM using the `fconv_lm_dauphin_wikitext103`
architecture:
```bash
fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/fconv_wikitext-103 \
--arch fconv_lm_dauphin_wikitext103 \
--adaptive-softmax-cutoff 10000,20000,200000 \
--dropout 0.2 \
--criterion adaptive_loss \
--optimizer nag --clip-norm 0.1 --weight-decay 5e-06 \
--lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \
--max-tokens 1024 --tokens-per-sample 1024 \
--ddp-backend legacy_ddp \
--max-epoch 35
```
And evaluate with:
```bash
fairseq-eval-lm data-bin/wikitext-103 --path checkpoints/fconv_wikitext-103/checkpoint_best.pt
```
## Citation
```bibtex
@inproceedings{dauphin2017language,
title={Language Modeling with Gated Convolutional Networks},
author={Dauphin, Yann N and Fan, Angela and Auli, Michael and Grangier, David},
booktitle={Proceedings of the 34th International Conference on Machine Learning-Volume 70},
pages={933--941},
year={2017},
organization={JMLR}
}
```
# Neural Language Modeling
## Pre-trained models
Model | Description | Dataset | Download
---|---|---|---
`transformer_lm.gbw.adaptive_huge` | Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) <br> 1026M params | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2)
`transformer_lm.wiki103.adaptive` | Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) <br> 247M params | [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2)
`transformer_lm.wmt19.en` | English LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz)
`transformer_lm.wmt19.de` | German LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz)
`transformer_lm.wmt19.ru` | Russian LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz)
## Example usage
We require a few additional Python dependencies for preprocessing:
```bash
pip install fastBPE sacremoses
```
To sample from a language model using PyTorch Hub:
```python
import torch
# List available models
torch.hub.list('pytorch/fairseq') # [..., 'transformer_lm.wmt19.en', ...]
# Load an English LM trained on WMT'19 News Crawl data
en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer='moses', bpe='fastbpe')
en_lm.eval() # disable dropout
# Move model to GPU
en_lm.cuda()
# Sample from the language model
en_lm.sample('Barack Obama', beam=1, sampling=True, sampling_topk=10, temperature=0.8)
# "Barack Obama is coming to Sydney and New Zealand (...)"
# Compute perplexity for a sequence
en_lm.score('Barack Obama is coming to Sydney and New Zealand')['positional_scores'].mean().neg().exp()
# tensor(15.1474)
# The same interface can be used with custom models as well
from fairseq.models.transformer_lm import TransformerLanguageModel
custom_lm = TransformerLanguageModel.from_pretrained('/path/to/model/dir', 'checkpoint100.pt', tokenizer='moses', bpe='fastbpe')
custom_lm.sample('Barack Obama', beam=5)
# "Barack Obama (...)"
```
## Training a transformer language model with the CLI tools
### 1) Preprocess the data
First download and prepare the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/):
```bash
cd examples/language_model/
bash prepare-wikitext-103.sh
cd ../..
```
Next preprocess/binarize the data:
```bash
TEXT=examples/language_model/wikitext-103
fairseq-preprocess \
--only-source \
--trainpref $TEXT/wiki.train.tokens \
--validpref $TEXT/wiki.valid.tokens \
--testpref $TEXT/wiki.test.tokens \
--destdir data-bin/wikitext-103 \
--workers 20
```
### 2) Train a language model
Next we'll train a basic transformer language model on wikitext-103. For more
advanced usage, see the [adaptive inputs README](README.adaptive_inputs.md).
To train a basic LM (assumes 2 GPUs):
```bash
fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/transformer_wikitext-103 \
--arch transformer_lm --share-decoder-input-output-embed \
--dropout 0.1 \
--optimizer adam --adam-betas '(0.9, 0.98)' --weight-decay 0.01 --clip-norm 0.0 \
--lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
--tokens-per-sample 512 --sample-break-mode none \
--max-tokens 2048 --update-freq 16 \
--fp16 \
--max-update 50000
```
If you run out of memory, try reducing `--max-tokens` (max number of tokens per
batch) or `--tokens-per-sample` (max sequence length). You can also adjust
`--update-freq` to accumulate gradients and simulate training on a different
number of GPUs.
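For example, a lower-memory variant of the command above (illustrative values, not a tuned configuration) halves `--max-tokens` and doubles `--update-freq`, which keeps the effective batch size roughly the same:
```bash
fairseq-train --task language_modeling \
    data-bin/wikitext-103 \
    --save-dir checkpoints/transformer_wikitext-103 \
    --arch transformer_lm --share-decoder-input-output-embed \
    --dropout 0.1 \
    --optimizer adam --adam-betas '(0.9, 0.98)' --weight-decay 0.01 --clip-norm 0.0 \
    --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
    --tokens-per-sample 512 --sample-break-mode none \
    --max-tokens 1024 --update-freq 32 \
    --fp16 \
    --max-update 50000
```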
### 3) Evaluate
```bash
fairseq-eval-lm data-bin/wikitext-103 \
--path checkpoints/transformer_wikitext-103/checkpoint_best.pt \
--batch-size 2 \
--tokens-per-sample 512 \
--context-window 400
# | Evaluated 245569 tokens in 56.1s (4379.02 tokens/s)
# | Loss: 3.4164, Perplexity: 30.46
```
*Note:* The `--context-window` option controls how much context is provided to
each token when computing perplexity. When the window size is 0, the dataset is
chunked into segments of length 512 and perplexity is computed over each segment
normally. However, this results in worse (higher) perplexity since tokens that
appear earlier in each segment have less conditioning. When the maximum window
size is used (511 in this case), then we compute perplexity for each token
fully conditioned on 511 tokens of context. This slows down evaluation
significantly, since we must run a separate forward pass for every token in the
dataset, but results in better (lower) perplexity.
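Concretely, the fully conditioned evaluation described above corresponds to passing the maximum window size (same data and checkpoint as in the example above; expect much slower evaluation):
```bash
fairseq-eval-lm data-bin/wikitext-103 \
    --path checkpoints/transformer_wikitext-103/checkpoint_best.pt \
    --batch-size 2 \
    --tokens-per-sample 512 \
    --context-window 511
```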
## Convolutional language models
Please see the [convolutional LM README](README.conv.md) for instructions on
training convolutional language models.
#!/bin/bash
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
URLS=(
"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"
)
FILES=(
"wikitext-103-v1.zip"
)
for ((i=0;i<${#URLS[@]};++i)); do
file=${FILES[i]}
if [ -f $file ]; then
echo "$file already exists, skipping download"
else
url=${URLS[i]}
wget "$url"
if [ -f $file ]; then
echo "$url successfully downloaded."
else
echo "$url not successfully downloaded."
exit -1
fi
if [ ${file: -4} == ".tgz" ]; then
tar zxvf $file
elif [ ${file: -4} == ".tar" ]; then
tar xvf $file
elif [ ${file: -4} == ".zip" ]; then
unzip $file
fi
fi
done
cd ..
# LASER Language-Agnostic SEntence Representations
LASER is a library to calculate and use multilingual sentence embeddings.
You can find more information about LASER and how to use it on the official [LASER repository](https://github.com/facebookresearch/LASER).
This folder contains source code for training LASER embeddings.
## Prepare data and configuration file
Binarize your data with fairseq, as described [here](https://fairseq.readthedocs.io/en/latest/getting_started.html#data-pre-processing).
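As a minimal sketch (hypothetical paths and language names; substitute your own SentencePiece vocabularies and raw data), binarizing one language pair for the config below might look like:
```
fairseq-preprocess \
    --source-lang srclang1 --target-lang tgtlang0 \
    --trainpref /path/to/raw/srclang1-tgtlang0/train \
    --destdir /path/to/srclang1-tgtlang0 \
    --srcdict /path/to/spm.src.cvocab \
    --tgtdict /path/to/spm.tgt.cvocab \
    --dataset-impl lazy \
    --workers 8
```
Note that the `laser` task in this folder loads plain `IndexedDataset` files, so a non-mmap `--dataset-impl` (e.g. `lazy` or `cached`) is assumed here instead of the default `mmap`.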
Create a json config file with this format:
```
{
"src_vocab": "/path/to/spm.src.cvocab",
"tgt_vocab": "/path/to/spm.tgt.cvocab",
"train": [
{
"type": "translation",
"id": 0,
"src": "/path/to/srclang1-tgtlang0/train.srclang1",
"tgt": "/path/to/srclang1-tgtlang0/train.tgtlang0"
},
{
"type": "translation",
"id": 1,
"src": "/path/to/srclang1-tgtlang1/train.srclang1",
"tgt": "/path/to/srclang1-tgtlang1/train.tgtlang1"
},
{
"type": "translation",
"id": 0,
"src": "/path/to/srclang2-tgtlang0/train.srclang2",
"tgt": "/path/to/srclang2-tgtlang0/train.tgtlang0"
},
{
"type": "translation",
"id": 1,
"src": "/path/to/srclang2-tgtlang1/train.srclang2",
"tgt": "/path/to/srclang2-tgtlang1/train.tgtlang1"
},
...
],
"valid": [
{
"type": "translation",
"id": 0,
"src": "/unused",
"tgt": "/unused"
}
]
}
```
where each path points to a binarized, indexed fairseq dataset.
`id` represents the target language id.
## Training Command Line Example
```
fairseq-train \
/path/to/configfile_described_above.json \
--user-dir examples/laser/laser_src \
--log-interval 100 --log-format simple \
--task laser --arch laser_lstm \
--save-dir . \
--optimizer adam \
--lr 0.001 \
--lr-scheduler inverse_sqrt \
--clip-norm 5 \
--warmup-updates 90000 \
--update-freq 2 \
--dropout 0.0 \
--encoder-dropout-out 0.1 \
--max-tokens 2000 \
--max-epoch 50 \
--encoder-bidirectional \
--encoder-layers 5 \
--encoder-hidden-size 512 \
--decoder-layers 1 \
--decoder-hidden-size 2048 \
--encoder-embed-dim 320 \
--decoder-embed-dim 320 \
--decoder-lang-embed-dim 32 \
--warmup-init-lr 0.001 \
--disable-validation
```
## Applications
We showcase several applications of multilingual sentence embeddings
with code to reproduce our results (in the directory "tasks").
* [**Cross-lingual document classification**](https://github.com/facebookresearch/LASER/tree/master/tasks/mldoc) using the
[*MLDoc*](https://github.com/facebookresearch/MLDoc) corpus [2,6]
* [**WikiMatrix**](https://github.com/facebookresearch/LASER/tree/master/tasks/WikiMatrix)
Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia [7]
* [**Bitext mining**](https://github.com/facebookresearch/LASER/tree/master/tasks/bucc) using the
[*BUCC*](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) corpus [3,5]
* [**Cross-lingual NLI**](https://github.com/facebookresearch/LASER/tree/master/tasks/xnli)
using the [*XNLI*](https://www.nyu.edu/projects/bowman/xnli/) corpus [4,5,6]
* [**Multilingual similarity search**](https://github.com/facebookresearch/LASER/tree/master/tasks/similarity) [1,6]
* [**Sentence embedding of text files**](https://github.com/facebookresearch/LASER/tree/master/tasks/embed)
an example of how to calculate sentence embeddings for arbitrary text files in any of the supported languages.
**For all tasks, we use exactly the same multilingual encoder, without any task specific optimization or fine-tuning.**
## References
[1] Holger Schwenk and Matthijs Douze,
[*Learning Joint Multilingual Sentence Representations with Neural Machine Translation*](https://aclanthology.info/papers/W17-2619/w17-2619),
ACL workshop on Representation Learning for NLP, 2017
[2] Holger Schwenk and Xian Li,
[*A Corpus for Multilingual Document Classification in Eight Languages*](http://www.lrec-conf.org/proceedings/lrec2018/pdf/658.pdf),
LREC, pages 3548-3551, 2018.
[3] Holger Schwenk,
[*Filtering and Mining Parallel Data in a Joint Multilingual Space*](http://aclweb.org/anthology/P18-2037)
ACL, July 2018
[4] Alexis Conneau, Guillaume Lample, Ruty Rinott, Adina Williams, Samuel R. Bowman, Holger Schwenk and Veselin Stoyanov,
[*XNLI: Cross-lingual Sentence Understanding through Inference*](https://aclweb.org/anthology/D18-1269),
EMNLP, 2018.
[5] Mikel Artetxe and Holger Schwenk,
[*Margin-based Parallel Corpus Mining with Multilingual Sentence Embeddings*](https://arxiv.org/abs/1811.01136)
arXiv, Nov 3 2018.
[6] Mikel Artetxe and Holger Schwenk,
[*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/abs/1812.10464)
arXiv, Dec 26 2018.
[7] Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman,
[*WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia*](https://arxiv.org/abs/1907.05791)
arXiv, July 11 2019.
[8] Holger Schwenk, Guillaume Wenzek, Sergey Edunov, Edouard Grave and Armand Joulin
[*CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB*](https://arxiv.org/abs/1911.04944)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from .laser_task import * # noqa
from .laser_lstm import * # noqa
from .laser_transformer import * # noqa
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq import options, utils
from fairseq.models import (
FairseqEncoder,
FairseqIncrementalDecoder,
FairseqEncoderDecoderModel,
register_model,
register_model_architecture,
)
@register_model("laser_lstm")
class LSTMModel(FairseqEncoderDecoderModel):
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
def forward(
self,
src_tokens,
src_lengths,
prev_output_tokens=None,
tgt_tokens=None,
tgt_lengths=None,
target_language_id=None,
dataset_name="",
):
assert target_language_id is not None
src_encoder_out = self.encoder(src_tokens, src_lengths, dataset_name)
return self.decoder(
prev_output_tokens, src_encoder_out, lang_id=target_language_id
)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument(
"--dropout",
default=0.1,
type=float,
metavar="D",
help="dropout probability",
)
parser.add_argument(
"--encoder-embed-dim",
type=int,
metavar="N",
help="encoder embedding dimension",
)
parser.add_argument(
"--encoder-embed-path",
default=None,
type=str,
metavar="STR",
help="path to pre-trained encoder embedding",
)
parser.add_argument(
"--encoder-hidden-size", type=int, metavar="N", help="encoder hidden size"
)
parser.add_argument(
"--encoder-layers", type=int, metavar="N", help="number of encoder layers"
)
parser.add_argument(
"--encoder-bidirectional",
action="store_true",
help="make all layers of encoder bidirectional",
)
parser.add_argument(
"--decoder-embed-dim",
type=int,
metavar="N",
help="decoder embedding dimension",
)
parser.add_argument(
"--decoder-embed-path",
default=None,
type=str,
metavar="STR",
help="path to pre-trained decoder embedding",
)
parser.add_argument(
"--decoder-hidden-size", type=int, metavar="N", help="decoder hidden size"
)
parser.add_argument(
"--decoder-layers", type=int, metavar="N", help="number of decoder layers"
)
parser.add_argument(
"--decoder-out-embed-dim",
type=int,
metavar="N",
help="decoder output embedding dimension",
)
parser.add_argument(
"--decoder-zero-init",
type=str,
metavar="BOOL",
help="initialize the decoder hidden/cell state to zero",
)
parser.add_argument(
"--decoder-lang-embed-dim",
type=int,
metavar="N",
help="decoder language embedding dimension",
)
parser.add_argument(
"--fixed-embeddings",
action="store_true",
help="keep embeddings fixed (ENCODER ONLY)",
) # TODO Also apply to decoder embeddings?
# Granular dropout settings (if not specified these default to --dropout)
parser.add_argument(
"--encoder-dropout-in",
type=float,
metavar="D",
help="dropout probability for encoder input embedding",
)
parser.add_argument(
"--encoder-dropout-out",
type=float,
metavar="D",
help="dropout probability for encoder output",
)
parser.add_argument(
"--decoder-dropout-in",
type=float,
metavar="D",
help="dropout probability for decoder input embedding",
)
parser.add_argument(
"--decoder-dropout-out",
type=float,
metavar="D",
help="dropout probability for decoder output",
)
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure that all args are properly defaulted (in case there are any new ones)
base_architecture(args)
def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
embed_dict = utils.parse_embedding(embed_path)
utils.print_embed_overlap(embed_dict, dictionary)
return utils.load_embedding(embed_dict, dictionary, embed_tokens)
pretrained_encoder_embed = None
if args.encoder_embed_path:
pretrained_encoder_embed = load_pretrained_embedding_from_file(
args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim
)
pretrained_decoder_embed = None
if args.decoder_embed_path:
pretrained_decoder_embed = load_pretrained_embedding_from_file(
args.decoder_embed_path, task.target_dictionary, args.decoder_embed_dim
)
num_langs = task.num_tasks if hasattr(task, "num_tasks") else 0
encoder = LSTMEncoder(
dictionary=task.source_dictionary,
embed_dim=args.encoder_embed_dim,
hidden_size=args.encoder_hidden_size,
num_layers=args.encoder_layers,
dropout_in=args.encoder_dropout_in,
dropout_out=args.encoder_dropout_out,
bidirectional=args.encoder_bidirectional,
pretrained_embed=pretrained_encoder_embed,
fixed_embeddings=args.fixed_embeddings,
)
decoder = LSTMDecoder(
dictionary=task.target_dictionary,
embed_dim=args.decoder_embed_dim,
hidden_size=args.decoder_hidden_size,
out_embed_dim=args.decoder_out_embed_dim,
num_layers=args.decoder_layers,
dropout_in=args.decoder_dropout_in,
dropout_out=args.decoder_dropout_out,
zero_init=options.eval_bool(args.decoder_zero_init),
encoder_embed_dim=args.encoder_embed_dim,
encoder_output_units=encoder.output_units,
pretrained_embed=pretrained_decoder_embed,
num_langs=num_langs,
lang_embed_dim=args.decoder_lang_embed_dim,
)
return cls(encoder, decoder)
class LSTMEncoder(FairseqEncoder):
"""LSTM encoder."""
def __init__(
self,
dictionary,
embed_dim=512,
hidden_size=512,
num_layers=1,
dropout_in=0.1,
dropout_out=0.1,
bidirectional=False,
left_pad=True,
pretrained_embed=None,
padding_value=0.0,
fixed_embeddings=False,
):
super().__init__(dictionary)
self.num_layers = num_layers
self.dropout_in = dropout_in
self.dropout_out = dropout_out
self.bidirectional = bidirectional
self.hidden_size = hidden_size
num_embeddings = len(dictionary)
self.padding_idx = dictionary.pad()
if pretrained_embed is None:
self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
else:
self.embed_tokens = pretrained_embed
if fixed_embeddings:
self.embed_tokens.weight.requires_grad = False
self.lstm = LSTM(
input_size=embed_dim,
hidden_size=hidden_size,
num_layers=num_layers,
dropout=self.dropout_out if num_layers > 1 else 0.0,
bidirectional=bidirectional,
)
self.left_pad = left_pad
self.padding_value = padding_value
self.output_units = hidden_size
if bidirectional:
self.output_units *= 2
def forward(self, src_tokens, src_lengths, dataset_name):
if self.left_pad:
# convert left-padding to right-padding
src_tokens = utils.convert_padding_direction(
src_tokens,
self.padding_idx,
left_to_right=True,
)
bsz, seqlen = src_tokens.size()
# embed tokens
x = self.embed_tokens(src_tokens)
x = F.dropout(x, p=self.dropout_in, training=self.training)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# pack embedded source tokens into a PackedSequence
try:
packed_x = nn.utils.rnn.pack_padded_sequence(x, src_lengths.data.tolist())
except BaseException:
raise Exception(f"Packing failed in dataset {dataset_name}")
# apply LSTM
if self.bidirectional:
state_size = 2 * self.num_layers, bsz, self.hidden_size
else:
state_size = self.num_layers, bsz, self.hidden_size
h0 = x.data.new(*state_size).zero_()
c0 = x.data.new(*state_size).zero_()
packed_outs, (final_hiddens, final_cells) = self.lstm(packed_x, (h0, c0))
# unpack outputs and apply dropout
x, _ = nn.utils.rnn.pad_packed_sequence(
packed_outs, padding_value=self.padding_value
)
x = F.dropout(x, p=self.dropout_out, training=self.training)
assert list(x.size()) == [seqlen, bsz, self.output_units]
if self.bidirectional:
def combine_bidir(outs):
return torch.cat(
[
torch.cat([outs[2 * i], outs[2 * i + 1]], dim=0).view(
1, bsz, self.output_units
)
for i in range(self.num_layers)
],
dim=0,
)
final_hiddens = combine_bidir(final_hiddens)
final_cells = combine_bidir(final_cells)
encoder_padding_mask = src_tokens.eq(self.padding_idx).t()
# Set padded outputs to -inf so they are not selected by max-pooling
padding_mask = src_tokens.eq(self.padding_idx).t().unsqueeze(-1)
if padding_mask.any():
x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x)
# Build the sentence embedding by max-pooling over the encoder outputs
sentemb = x.max(dim=0)[0]
return {
"sentemb": sentemb,
"encoder_out": (x, final_hiddens, final_cells),
"encoder_padding_mask": encoder_padding_mask
if encoder_padding_mask.any()
else None,
}
def reorder_encoder_out(self, encoder_out_dict, new_order):
encoder_out_dict["sentemb"] = encoder_out_dict["sentemb"].index_select(
0, new_order
)
encoder_out_dict["encoder_out"] = tuple(
eo.index_select(1, new_order) for eo in encoder_out_dict["encoder_out"]
)
if encoder_out_dict["encoder_padding_mask"] is not None:
encoder_out_dict["encoder_padding_mask"] = encoder_out_dict[
"encoder_padding_mask"
].index_select(1, new_order)
return encoder_out_dict
def max_positions(self):
"""Maximum input length supported by the encoder."""
return int(1e5) # an arbitrary large number
class LSTMDecoder(FairseqIncrementalDecoder):
"""LSTM decoder."""
def __init__(
self,
dictionary,
embed_dim=512,
hidden_size=512,
out_embed_dim=512,
num_layers=1,
dropout_in=0.1,
dropout_out=0.1,
zero_init=False,
encoder_embed_dim=512,
encoder_output_units=512,
pretrained_embed=None,
num_langs=1,
lang_embed_dim=0,
):
super().__init__(dictionary)
self.dropout_in = dropout_in
self.dropout_out = dropout_out
self.hidden_size = hidden_size
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
if pretrained_embed is None:
self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
else:
self.embed_tokens = pretrained_embed
self.layers = nn.ModuleList(
[
LSTMCell(
input_size=encoder_output_units + embed_dim + lang_embed_dim
if layer == 0
else hidden_size,
hidden_size=hidden_size,
)
for layer in range(num_layers)
]
)
if hidden_size != out_embed_dim:
self.additional_fc = Linear(hidden_size, out_embed_dim)
self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
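        # Unless zero-initialized, the decoder's initial hidden/cell states for every
        # layer are predicted from the encoder's sentence embedding (see forward below).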
if zero_init:
self.sentemb2init = None
else:
self.sentemb2init = Linear(
encoder_output_units, 2 * num_layers * hidden_size
)
if lang_embed_dim == 0:
self.embed_lang = None
else:
self.embed_lang = nn.Embedding(num_langs, lang_embed_dim)
nn.init.uniform_(self.embed_lang.weight, -0.1, 0.1)
def forward(
self, prev_output_tokens, encoder_out_dict, incremental_state=None, lang_id=0
):
sentemb = encoder_out_dict["sentemb"]
encoder_out = encoder_out_dict["encoder_out"]
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
bsz, seqlen = prev_output_tokens.size()
# get outputs from encoder
encoder_outs, _, _ = encoder_out[:3]
srclen = encoder_outs.size(0)
# embed tokens
x = self.embed_tokens(prev_output_tokens)
x = F.dropout(x, p=self.dropout_in, training=self.training)
# embed language identifier
if self.embed_lang is not None:
lang_ids = prev_output_tokens.data.new_full((bsz,), lang_id)
langemb = self.embed_lang(lang_ids)
# TODO Should we dropout here???
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# initialize previous states (or get from cache during incremental generation)
cached_state = utils.get_incremental_state(
self, incremental_state, "cached_state"
)
if cached_state is not None:
prev_hiddens, prev_cells, input_feed = cached_state
else:
num_layers = len(self.layers)
if self.sentemb2init is None:
prev_hiddens = [
x.data.new(bsz, self.hidden_size).zero_() for i in range(num_layers)
]
prev_cells = [
x.data.new(bsz, self.hidden_size).zero_() for i in range(num_layers)
]
else:
init = self.sentemb2init(sentemb)
prev_hiddens = [
init[:, (2 * i) * self.hidden_size : (2 * i + 1) * self.hidden_size]
for i in range(num_layers)
]
prev_cells = [
init[
:,
(2 * i + 1) * self.hidden_size : (2 * i + 2) * self.hidden_size,
]
for i in range(num_layers)
]
input_feed = x.data.new(bsz, self.hidden_size).zero_()
attn_scores = x.data.new(srclen, seqlen, bsz).zero_()
outs = []
for j in range(seqlen):
if self.embed_lang is None:
input = torch.cat((x[j, :, :], sentemb), dim=1)
else:
input = torch.cat((x[j, :, :], sentemb, langemb), dim=1)
for i, rnn in enumerate(self.layers):
# recurrent cell
hidden, cell = rnn(input, (prev_hiddens[i], prev_cells[i]))
# hidden state becomes the input to the next layer
input = F.dropout(hidden, p=self.dropout_out, training=self.training)
# save state for next time step
prev_hiddens[i] = hidden
prev_cells[i] = cell
out = hidden
out = F.dropout(out, p=self.dropout_out, training=self.training)
# input feeding
input_feed = out
# save final output
outs.append(out)
# cache previous states (no-op except during incremental generation)
utils.set_incremental_state(
self,
incremental_state,
"cached_state",
(prev_hiddens, prev_cells, input_feed),
)
# collect outputs across time steps
x = torch.cat(outs, dim=0).view(seqlen, bsz, self.hidden_size)
# T x B x C -> B x T x C
x = x.transpose(1, 0)
# srclen x tgtlen x bsz -> bsz x tgtlen x srclen
attn_scores = attn_scores.transpose(0, 2)
# project back to size of vocabulary
if hasattr(self, "additional_fc"):
x = self.additional_fc(x)
x = F.dropout(x, p=self.dropout_out, training=self.training)
x = self.fc_out(x)
return x, attn_scores
def reorder_incremental_state(self, incremental_state, new_order):
super().reorder_incremental_state(incremental_state, new_order)
cached_state = utils.get_incremental_state(
self, incremental_state, "cached_state"
)
if cached_state is None:
return
def reorder_state(state):
if isinstance(state, list):
return [reorder_state(state_i) for state_i in state]
return state.index_select(0, new_order)
new_state = tuple(map(reorder_state, cached_state))
utils.set_incremental_state(self, incremental_state, "cached_state", new_state)
def max_positions(self):
"""Maximum output length supported by the decoder."""
return int(1e5) # an arbitrary large number
def Embedding(num_embeddings, embedding_dim, padding_idx):
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
nn.init.uniform_(m.weight, -0.1, 0.1)
nn.init.constant_(m.weight[padding_idx], 0)
return m
def LSTM(input_size, hidden_size, **kwargs):
m = nn.LSTM(input_size, hidden_size, **kwargs)
for name, param in m.named_parameters():
if "weight" in name or "bias" in name:
param.data.uniform_(-0.1, 0.1)
return m
def LSTMCell(input_size, hidden_size, **kwargs):
m = nn.LSTMCell(input_size, hidden_size, **kwargs)
for name, param in m.named_parameters():
if "weight" in name or "bias" in name:
param.data.uniform_(-0.1, 0.1)
return m
def Linear(in_features, out_features, bias=True, dropout=0):
"""Weight-normalized Linear layer (input: N x T x C)"""
m = nn.Linear(in_features, out_features, bias=bias)
m.weight.data.uniform_(-0.1, 0.1)
if bias:
m.bias.data.uniform_(-0.1, 0.1)
return m
@register_model_architecture("laser_lstm", "laser_lstm")
def base_architecture(args):
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
args.encoder_hidden_size = getattr(
args, "encoder_hidden_size", args.encoder_embed_dim
)
args.encoder_layers = getattr(args, "encoder_layers", 1)
args.encoder_bidirectional = getattr(args, "encoder_bidirectional", False)
args.encoder_dropout_in = getattr(args, "encoder_dropout_in", args.dropout)
args.encoder_dropout_out = getattr(args, "encoder_dropout_out", args.dropout)
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
args.decoder_hidden_size = getattr(
args, "decoder_hidden_size", args.decoder_embed_dim
)
args.decoder_layers = getattr(args, "decoder_layers", 1)
args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 512)
args.decoder_dropout_in = getattr(args, "decoder_dropout_in", args.dropout)
args.decoder_dropout_out = getattr(args, "decoder_dropout_out", args.dropout)
args.decoder_zero_init = getattr(args, "decoder_zero_init", "0")
args.decoder_lang_embed_dim = getattr(args, "decoder_lang_embed_dim", 0)
args.fixed_embeddings = getattr(args, "fixed_embeddings", False)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from collections import OrderedDict, defaultdict
import json
import os
import logging
from argparse import ArgumentError
from fairseq import options, models
from fairseq.data import (
data_utils,
Dictionary,
LanguagePairDataset,
IndexedDataset,
FairseqDataset,
)
from .multitask_data_utils import (
MultitaskDatasetWrapper,
MultidatasetEpochBatchIterator,
)
from fairseq.tasks import LegacyFairseqTask, register_task
logger = logging.getLogger(__name__)
@register_task("laser")
class LaserTask(LegacyFairseqTask):
@staticmethod
def add_args(parser):
"""Add task-specific arguments to the parser."""
parser.add_argument(
"configfile", metavar="PATH", help="dataset configuration file in json"
)
parser.add_argument(
"--weighting-alpha",
type=float,
default=None,
help="alpha for automatic weighting",
)
parser.add_argument(
"--raw-text", action="store_true", help="load raw text dataset"
)
parser.add_argument(
"--left-pad-source",
default="True",
type=str,
metavar="BOOL",
help="pad the source on the left (default: True)",
)
parser.add_argument(
"--left-pad-target",
default="False",
type=str,
metavar="BOOL",
help="pad the target on the left (default: False)",
)
try:
parser.add_argument(
"--max-source-positions",
default=1024,
type=int,
metavar="N",
help="max number of tokens in the source sequence",
)
parser.add_argument(
"--max-target-positions",
default=1024,
type=int,
metavar="N",
help="max number of tokens in the target sequence",
)
except ArgumentError:
# this might have already been defined. Once we transition this to hydra it should be fine to add it here.
pass
def __init__(self, args, config, src_dictionary, tgt_dictionary, num_tasks):
super().__init__(args)
self.config = config
self.src_dictionary = src_dictionary
self.tgt_dictionary = tgt_dictionary
self.num_tasks = num_tasks
@classmethod
def setup_task(cls, args, **kwargs):
with open(args.configfile, "r") as f:
config = json.load(f)
num_tasks = max(dataset["id"] for dataset in config["train"]) + 1
args.left_pad_source = options.eval_bool(args.left_pad_source)
args.left_pad_target = options.eval_bool(args.left_pad_target)
src_dictionary = Dictionary.load(config["src_vocab"])
tgt_dictionary = Dictionary.load(config["tgt_vocab"])
logger.info(
"| src Dictionary {} : {} types".format(
config["src_vocab"], len(src_dictionary)
)
)
logger.info(
"| tgt Dictionary {} : {} types".format(
config["tgt_vocab"], len(tgt_dictionary)
)
)
return cls(args, config, src_dictionary, tgt_dictionary, num_tasks)
# Experimental overriding for backtranslation
def build_model(self, args, from_checkpoint=False):
model = models.build_model(args, self)
return model
def dataset(self, split):
if split not in self.datasets:
raise KeyError("Dataset not loaded: " + split)
return self.datasets[split]
def load_dataset(self, split, epoch=1, **kwargs):
"""Load a dataset split."""
def indexed_dataset(path, dictionary):
if self.args.raw_text:
raise Exception("Unable to handle raw text.")
dataset = IndexedDataset(path, fix_lua_indexing=True)
return dataset
pair_datasets = OrderedDict()
if split == "valid":
self.datasets[split] = pair_datasets
return
if split not in self.config:
raise FileNotFoundError(
"Dataset not found in config file: {}".format(split)
)
size_by_corpus = defaultdict(int)
size_sum = 0
size_sum_with_subsampling = 0
init_pair_datasets = {}
for dataset_config in self.config[split]:
src_path = os.path.dirname(dataset_config["src"])
corpus_name = src_path.split("/")[-2]
language_pair_name = src_path.split("/")[-1]
pair_datasets_key = corpus_name + "-" + language_pair_name
logger.info(f"loading... {pair_datasets_key}")
if "src" in dataset_config:
src_dataset = indexed_dataset(
dataset_config["src"], self.src_dictionary
)
else:
src_dataset = None
if "tgt" in dataset_config:
tgt_dataset = indexed_dataset(
dataset_config["tgt"], self.tgt_dictionary
)
else:
tgt_dataset = None
dataset = LanguagePairDataset(
src_dataset,
src_dataset.sizes,
self.src_dictionary,
tgt_dataset,
tgt_dataset.sizes,
self.tgt_dictionary,
left_pad_source=self.args.left_pad_source,
left_pad_target=self.args.left_pad_target,
)
if pair_datasets_key in init_pair_datasets:
logger.warning(
f"Ignoring already added {pair_datasets_key}. "
f"Consider using `sample` key in order to upsample."
)
else:
init_pair_datasets[pair_datasets_key] = {
"dataset": dataset,
"sample": dataset_config.get("sample", None),
"id": dataset_config.get("id", None),
"len": len(dataset),
}
length_sum = 0
weighted_freqs_sum = 0
freq_per_dataset = {}
vmax = 0
vmin = 1
weighted_freq_per_dataset = {}
if self.args.weighting_alpha:
for key in init_pair_datasets:
if init_pair_datasets[key]["sample"] is None:
length_sum += len(init_pair_datasets[key]["dataset"])
for key in init_pair_datasets:
if init_pair_datasets[key]["sample"] is None:
val = float(init_pair_datasets[key]["len"]) / length_sum
freq_per_dataset[key] = val
weighted_freqs_sum += val ** self.args.weighting_alpha
for key in freq_per_dataset:
val = (
freq_per_dataset[key] ** self.args.weighting_alpha
/ weighted_freqs_sum
)
vmin = min(vmin, val)
vmax = max(vmax, val)
weighted_freq_per_dataset[key] = val
for pair_datasets_key in init_pair_datasets:
dataset_config = init_pair_datasets[pair_datasets_key]
dataset = dataset_config["dataset"]
sample = dataset_config["sample"]
if sample is None:
sample = 1.0
if pair_datasets_key in weighted_freq_per_dataset:
w = vmax / weighted_freq_per_dataset[pair_datasets_key]
sample = w
sample = round(sample)
initial_sample = sample
initial_pair_datasets_key = pair_datasets_key
while sample >= 1.0:
assert (
pair_datasets_key not in pair_datasets
), f"{pair_datasets_key} already in"
size_sum_with_subsampling += len(dataset)
pair_datasets[pair_datasets_key] = MultitaskDatasetWrapper(
dataset, dataset_config.get("id", 0), 1.0, name=pair_datasets_key
)
size_sum += len(dataset)
sample -= 1.0
pair_datasets_key += "-up"
assert sample < 1e-6, f"sample remains > 0 {pair_datasets_key}"
logger.info(
f"added pair {initial_pair_datasets_key} length {len(dataset)} new_length = {len(dataset)*initial_sample}"
)
size_by_corpus[corpus_name] += len(dataset)
self.datasets[split] = pair_datasets
logger.info(
f"Datasets number = {len(self.datasets[split])} size = {size_sum} size_sum_with_subsampling = {size_sum_with_subsampling}"
)
@property
def source_dictionary(self):
return self.src_dictionary
@property
def target_dictionary(self):
return self.tgt_dictionary
def get_batch_iterator(
self,
dataset,
max_tokens=None,
max_sentences=None,
max_positions=None,
ignore_invalid_inputs=False,
required_batch_size_multiple=1,
seed=1,
num_shards=1,
shard_id=0,
num_workers=0,
epoch=1,
data_buffer_size=0,
disable_iterator_cache=False,
grouped_shuffling=False,
update_epoch_batch_itr=False,
**kwargs,
):
assert isinstance(dataset, OrderedDict)
assert len(dataset)
assert isinstance(dataset[next(iter(dataset))], FairseqDataset)
# initialize the dataset with the correct starting epoch
for _, dt in dataset.items():
dt.set_epoch(epoch)
indices = OrderedDict()
batch_sampler = OrderedDict()
with data_utils.numpy_seed(seed + epoch):
for key, dt in dataset.items():
logger.info(f"\t ordered_indices {key}")
indices[key] = dt.ordered_indices()
# filter examples that are too large
if max_positions is not None:
for key, dt in dataset.items():
logger.info(f"\t filter_by_size {key}")
indices[key], ignored = dt.filter_indices_by_size(
indices[key], max_positions
)
for key, dt in dataset.items():
logger.info(f"\t batch_by_size {key}")
batch_sampler[key] = data_utils.batch_by_size(
indices[key],
dt.num_tokens,
max_tokens=max_tokens,
max_sentences=max_sentences,
required_batch_size_multiple=required_batch_size_multiple,
)
epoch_iter = MultidatasetEpochBatchIterator(
dataset=dataset,
batch_sampler=batch_sampler,
seed=seed,
num_shards=num_shards,
shard_id=shard_id,
num_workers=num_workers,
epoch=epoch,
)
return epoch_iter
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
from typing import Any, Dict, List, Optional
from torch import Tensor
import torch
import torch.nn as nn
from fairseq.models import (
FairseqEncoderDecoderModel,
register_model,
register_model_architecture,
)
from fairseq.models.transformer import (
base_architecture,
Embedding,
TransformerModel,
TransformerEncoder,
TransformerDecoder,
)
from fairseq.modules import (
TransformerDecoderLayer,
)
logger = logging.getLogger(__name__)
@register_model("laser_transformer")
class LaserTransformerModel(FairseqEncoderDecoderModel):
"""Train Transformer for LASER task
Requires --task laser
"""
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
def forward(
self,
src_tokens,
src_lengths,
prev_output_tokens=None,
tgt_tokens=None,
tgt_lengths=None,
target_language_id=-1,
dataset_name="",
):
laser_encoder_out = self.encoder(src_tokens, src_lengths)
return self.decoder(
prev_output_tokens, laser_encoder_out, lang_id=target_language_id
)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
TransformerModel.add_args(parser)
parser.add_argument(
"--decoder-lang-embed-dim",
type=int,
metavar="N",
help="decoder language embedding dimension",
)
@classmethod
def build_model(cls, args, task):
base_laser_transformer_architecture(args)
num_langs = task.num_tasks if hasattr(task, "num_tasks") else 0
def load_embed_tokens(dictionary, embed_dim):
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
return Embedding(num_embeddings, embed_dim, padding_idx)
encoder_embed_tokens = load_embed_tokens(
task.source_dictionary, args.encoder_embed_dim
)
decoder_embed_tokens = load_embed_tokens(
task.target_dictionary, args.decoder_embed_dim
)
num_langs = task.num_tasks if hasattr(task, "num_tasks") else 0
encoder = LaserTransformerEncoder(
args, task.source_dictionary, encoder_embed_tokens
)
decoder = LaserTransformerDecoder(
args,
task.target_dictionary,
decoder_embed_tokens,
num_langs=num_langs,
lang_embed_dim=args.decoder_lang_embed_dim,
)
return cls(encoder, decoder)
class LaserTransformerEncoder(TransformerEncoder):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, src_tokens, *args, **kwargs):
encoder_out = super().forward(src_tokens, *args, **kwargs)
x = encoder_out["encoder_out"][0] # T x B x C
padding_mask = src_tokens.eq(self.padding_idx).t().unsqueeze(-1)
if padding_mask.any():
x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x)
# Build the sentence embedding by max-pooling over the encoder outputs
sentemb = x.max(dim=0)[0]
# The PyTorch Mobile lite interpreter does not support returning NamedTuple in
# `forward`, so we use a dictionary instead.
# TorchScript does not support mixed values so the values are all lists.
# The empty list is equivalent to None.
return {"sentemb": [sentemb]} # B x C
@torch.jit.export
def reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order):
"""
Same as the one in transformer.py, with new_sentemb
"""
if len(encoder_out["sentemb"]) == 0:
new_sentemb = []
else:
new_sentemb = [encoder_out["sentemb"][0].index_select(0, new_order)]
return {
"sentemb": new_sentemb, # B x C
}
class LaserTransformerDecoder(TransformerDecoder):
def __init__(self, args, dictionary, *kargs, **kwargs):
self.num_langs = kwargs.get("num_langs", 1)
self.lang_embed_dim = kwargs.get("lang_embed_dim", 0)
kwargs.pop("num_langs", None)
kwargs.pop("lang_embed_dim", None)
super().__init__(args, dictionary, *kargs, **kwargs, no_encoder_attn=True)
if self.lang_embed_dim == 0:
self.embed_lang = None
else:
self.embed_lang = nn.Embedding(self.num_langs, self.lang_embed_dim)
nn.init.uniform_(self.embed_lang.weight, -0.1, 0.1)
if self.output_projection is not None:
laser_output_embed_dim = (
self.output_embed_dim + self.lang_embed_dim + args.encoder_embed_dim
)
self.output_projection = nn.Linear(
laser_output_embed_dim, len(dictionary), bias=False
)
nn.init.normal_(
self.output_projection.weight,
mean=0,
std=laser_output_embed_dim ** -0.5,
)
def build_decoder_layer(self, args, no_encoder_attn=False):
decoder_embed_dim = args.decoder_embed_dim
args.decoder_embed_dim = (
decoder_embed_dim + self.lang_embed_dim + args.encoder_embed_dim
)
res = TransformerDecoderLayer(args, no_encoder_attn=True)
args.decoder_embed_dim = decoder_embed_dim
return res
def extract_features(
self,
prev_output_tokens,
encoder_out: Optional[Dict[str, List[Tensor]]],
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
full_context_alignment: bool = False,
alignment_layer: Optional[int] = None,
alignment_heads: Optional[int] = None,
lang_id: Optional[int] = None,
):
"""
Similar to *forward* but only return features.
Includes several features from "Jointly Learning to Align and
Translate with Transformer Models" (Garg et al., EMNLP 2019).
Args:
full_context_alignment (bool, optional): don't apply
auto-regressive mask to self-attention (default: False).
alignment_layer (int, optional): return mean alignment over
heads at this layer (default: last layer).
alignment_heads (int, optional): only average alignment over
this many heads (default: all heads).
Returns:
tuple:
- the decoder's features of shape `(batch, tgt_len, embed_dim)`
- a dictionary with any model-specific outputs
"""
if alignment_layer is None:
alignment_layer = self.num_layers - 1
# embed positions
positions = (
self.embed_positions(
prev_output_tokens, incremental_state=incremental_state
)
if self.embed_positions is not None
else None
)
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
if positions is not None:
positions = positions[:, -1:]
bsz, seqlen = prev_output_tokens.size()
# embed tokens and positions
x = self.embed_scale * self.embed_tokens(prev_output_tokens)
if self.quant_noise is not None:
x = self.quant_noise(x)
if self.project_in_dim is not None:
x = self.project_in_dim(x)
if positions is not None:
x += positions
if self.layernorm_embedding is not None:
x = self.layernorm_embedding(x)
x = self.dropout_module(x)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
if self.embed_lang is not None:
lang_ids = prev_output_tokens.data.new_full((bsz,), lang_id)
langemb = self.embed_lang(lang_ids)
langemb = langemb.unsqueeze(0)
repeat_vals = [x.shape[0] // langemb.shape[0]] + [-1] * (
len(langemb.shape) - 1
)
x = torch.cat((x, langemb.expand(*repeat_vals)), dim=-1)
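        # The decoder has no encoder attention; instead, the encoder's max-pooled
        # sentence embedding is appended to every target position below.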
sentemb = encoder_out["sentemb"][0]
sentemb = sentemb.unsqueeze(0)
repeat_vals = [x.shape[0] // sentemb.shape[0]] + [-1] * (len(sentemb.shape) - 1)
x = torch.cat((x, sentemb.expand(*repeat_vals)), dim=-1)
self_attn_padding_mask: Optional[Tensor] = None
if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any():
self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)
# decoder layers
attn: Optional[Tensor] = None
inner_states: List[Optional[Tensor]] = [x]
for idx, layer in enumerate(self.layers):
if incremental_state is None and not full_context_alignment:
self_attn_mask = self.buffered_future_mask(x)
else:
self_attn_mask = None
x, layer_attn, _ = layer(
x,
None,
None,
incremental_state,
self_attn_mask=self_attn_mask,
self_attn_padding_mask=self_attn_padding_mask,
need_attn=bool((idx == alignment_layer)),
need_head_weights=bool((idx == alignment_layer)),
)
inner_states.append(x)
if layer_attn is not None and idx == alignment_layer:
attn = layer_attn.float().to(x)
if attn is not None:
if alignment_heads is not None:
attn = attn[:alignment_heads]
# average probabilities over heads
attn = attn.mean(dim=0)
if self.layer_norm is not None:
x = self.layer_norm(x)
# T x B x C -> B x T x C
x = x.transpose(0, 1)
if self.project_out_dim is not None:
x = self.project_out_dim(x)
return x, {"attn": [attn], "inner_states": inner_states}
def forward(
self,
prev_output_tokens,
encoder_out: Optional[Dict[str, List[Tensor]]] = None,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
features_only: bool = False,
alignment_layer: Optional[int] = None,
alignment_heads: Optional[int] = None,
src_lengths: Optional[Any] = None,
return_all_hiddens: bool = False,
lang_id: Optional[int] = None,
):
"""
Args:
prev_output_tokens (LongTensor): previous decoder outputs of shape
`(batch, tgt_len)`, for teacher forcing
encoder_out (optional): output from the encoder, used for
encoder-side attention
incremental_state (dict): dictionary used for storing state during
:ref:`Incremental decoding`
features_only (bool, optional): only return features without
applying output layer (default: False).
Returns:
tuple:
- the decoder's output of shape `(batch, tgt_len, vocab)`
- a dictionary with any model-specific outputs
"""
assert lang_id is not None
x, extra = self.extract_features(
prev_output_tokens,
encoder_out=encoder_out,
incremental_state=incremental_state,
alignment_layer=alignment_layer,
alignment_heads=alignment_heads,
lang_id=lang_id,
)
if not features_only:
x = self.output_layer(x)
return x, extra
@register_model_architecture("laser_transformer", "laser_transformer")
def base_laser_transformer_architecture(args):
base_architecture(args)
args.decoder_lang_embed_dim = getattr(args, "decoder_lang_embed_dim", 0)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from collections import OrderedDict
import numpy as np
from fairseq.data import BaseWrapperDataset, FairseqDataset, iterators
class MultiItr(object):
def __init__(self, itr):
self.itr = itr
self._counts = [0 for x in itr]
def __len__(self):
return sum(len(itr) for itr in self.itr)
def __iter__(self):
return self
def __next__(self):
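        # Draw from the iterator that is furthest behind (lowest consumed fraction)
        # so all underlying datasets advance at a similar rate.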
ratios = [count / len(itr) for count, itr in zip(self._counts, self.itr)]
idx = ratios.index(min(ratios))
self._counts[idx] += 1
return next(self.itr[idx])
class MultidatasetEpochBatchIterator(iterators.EpochBatchIterating):
"""A wrapper around multiple epoch batch iterators."""
def __init__(
self,
dataset,
batch_sampler,
seed=1,
num_shards=1,
shard_id=0,
num_workers=0,
epoch=1,
):
assert isinstance(dataset, OrderedDict)
assert len(dataset)
assert isinstance(dataset[next(iter(dataset))], FairseqDataset)
self.iterators = []
self.epoch = epoch
for key, dt in dataset.items():
epoch_iter = iterators.EpochBatchIterator(
dataset=dt,
collate_fn=dt.collater,
batch_sampler=batch_sampler[key],
seed=seed,
num_shards=num_shards,
shard_id=shard_id,
num_workers=0,
epoch=epoch,
)
self.iterators.append(epoch_iter)
def __len__(self):
return sum(len(itr) for itr in self.iterators)
def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False):
# `self.epoch += 1` should be handled by underlying `EpochBatchIterator`s.
return MultiItr(
[
itr.next_epoch_itr(
shuffle=shuffle, fix_batches_to_gpus=fix_batches_to_gpus
)
for itr in self.iterators
]
)
def end_of_epoch(self):
return all(itr.end_of_epoch() for itr in self.iterators)
@property
def next_epoch_idx(self):
"""Return the epoch index after *next_epoch_itr* is called."""
epochs = [itr.next_epoch_idx for itr in self.iterators]
self.epoch = epochs[0]
assert all(epoch == self.epoch for epoch in epochs)
return self.epoch
@property
def iterations_in_epoch(self):
return sum(itr.iterations_in_epoch for itr in self.iterators)
def state_dict(self):
return {
"iterators": [it.state_dict() for it in self.iterators],
"epoch": self.epoch,
}
def load_state_dict(self, state_dict):
self.epoch = state_dict["epoch"]
for it, d in zip(self.iterators, state_dict["iterators"]):
it.load_state_dict(d)
class MultitaskDatasetWrapper(BaseWrapperDataset):
"""A wrapper for a multitask dataset."""
def __init__(self, dataset, target_language_id, sample=1.0, name=""):
super().__init__(dataset)
self.target_language_id = target_language_id
self.sample = sample
self.name = name
def collater(self, *args, **kwargs):
ans = self.dataset.collater(*args, **kwargs)
if "net_input" in ans:
ans["net_input"]["target_language_id"] = self.target_language_id
ans["net_input"]["dataset_name"] = self.name
return ans
def num_tokens(self, *args, **kwargs):
return self.dataset.num_tokens(*args, **kwargs)
def ordered_indices(self, *args, **kwargs):
indices = self.dataset.ordered_indices(*args, **kwargs)
# Hacky solution for sampling
size = int(self.sample * indices.shape[0])
return indices.take(np.sort(np.random.permutation(indices.shape[0])[:size]))
def size(self, index: int):
return self.dataset.size(index)
@property
def supports_prefetch(self):
"""Whether this dataset supports prefetching."""
return getattr(self.dataset, "supports_prefetch", False)
def prefetch(self, indices):
return self.dataset.prefetch(indices)