Commit 18d27e00 authored by wangwei990215

initial commit

parent 541f4c7a
#!/usr/bin/env python3 -u
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Translate pre-processed data with a trained model.
"""
import numpy as np
import torch
from fairseq import checkpoint_utils, options, progress_bar, tasks, utils
from fairseq.sequence_generator import EnsembleModel
def get_avg_pool(
models, sample, prefix_tokens, src_dict, remove_bpe, has_langtok=False
):
model = EnsembleModel(models)
# model.forward normally channels prev_output_tokens into the decoder
# separately, but SequenceGenerator directly calls model.encoder
encoder_input = {
k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens"
}
# compute the encoder output for each beam
encoder_outs = model.forward_encoder(encoder_input)
np_encoder_outs = encoder_outs[0].encoder_out.cpu().numpy().astype(np.float32)
encoder_mask = 1 - encoder_outs[0].encoder_padding_mask.cpu().numpy().astype(
np.float32
)
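    # encoder_out is (T x B x C) while the padding mask is (B x T);
    # transpose to (T x B) and add a feature axis so it broadcasts below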
encoder_mask = np.expand_dims(encoder_mask.T, axis=2)
if has_langtok:
encoder_mask = encoder_mask[1:, :, :]
        np_encoder_outs = np_encoder_outs[1:, :, :]
masked_encoder_outs = encoder_mask * np_encoder_outs
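    # Masked mean over time: divide by each sentence's unpadded length,
    # then sum over the time axis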
avg_pool = (masked_encoder_outs / encoder_mask.sum(axis=0)).sum(axis=0)
return avg_pool
def main(args):
assert args.path is not None, "--path required for generation!"
assert (
not args.sampling or args.nbest == args.beam
), "--sampling requires --nbest to be equal to --beam"
assert (
args.replace_unk is None or args.raw_text
), "--replace-unk requires a raw text dataset (--raw-text)"
args.beam = 1
utils.import_user_module(args)
if args.max_tokens is None:
args.max_tokens = 12000
print(args)
use_cuda = torch.cuda.is_available() and not args.cpu
# Load dataset splits
task = tasks.setup_task(args)
task.load_dataset(args.gen_subset)
# Set dictionaries
try:
src_dict = getattr(task, "source_dictionary", None)
except NotImplementedError:
src_dict = None
tgt_dict = task.target_dictionary
# Load ensemble
print("| loading model(s) from {}".format(args.path))
models, _model_args = checkpoint_utils.load_model_ensemble(
args.path.split(":"),
arg_overrides=eval(args.model_overrides),
task=task,
)
# Optimize ensemble for generation
for model in models:
model.make_generation_fast_(
beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
need_attn=args.print_alignment,
)
if args.fp16:
model.half()
if use_cuda:
model.cuda()
# Load alignment dictionary for unknown word replacement
# (None if no unknown word replacement, empty if no path to align dictionary)
align_dict = utils.load_align_dict(args.replace_unk)
# Load dataset (possibly sharded)
itr = task.get_batch_iterator(
dataset=task.dataset(args.gen_subset),
max_tokens=args.max_tokens,
max_positions=utils.resolve_max_positions(
task.max_positions(),
),
ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
required_batch_size_multiple=args.required_batch_size_multiple,
num_shards=args.num_shards,
shard_id=args.shard_id,
num_workers=args.num_workers,
).next_epoch_itr(shuffle=False)
num_sentences = 0
source_sentences = []
shard_id = 0
all_avg_pool = None
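    # True when an mBART-style language token is prepended to the source,
    # in which case get_avg_pool drops the first position before pooling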
encoder_has_langtok = (
hasattr(task.args, "encoder_langtok")
and task.args.encoder_langtok is not None
and hasattr(task.args, "lang_tok_replacing_bos_eos")
and not task.args.lang_tok_replacing_bos_eos
)
with progress_bar.build_progress_bar(args, itr) as t:
for sample in t:
if sample is None:
print("Skipping None")
continue
sample = utils.move_to_cuda(sample) if use_cuda else sample
if "net_input" not in sample:
continue
prefix_tokens = None
if args.prefix_size > 0:
prefix_tokens = sample["target"][:, : args.prefix_size]
with torch.no_grad():
avg_pool = get_avg_pool(
models,
sample,
prefix_tokens,
src_dict,
args.remove_bpe,
has_langtok=encoder_has_langtok,
)
if all_avg_pool is not None:
all_avg_pool = np.concatenate((all_avg_pool, avg_pool))
else:
all_avg_pool = avg_pool
if not isinstance(sample["id"], list):
sample_ids = sample["id"].tolist()
else:
sample_ids = sample["id"]
for i, sample_id in enumerate(sample_ids):
# Remove padding
src_tokens = utils.strip_pad(
sample["net_input"]["src_tokens"][i, :], tgt_dict.pad()
)
# Either retrieve the original sentences or regenerate them from tokens.
if align_dict is not None:
src_str = task.dataset(args.gen_subset).src.get_original_text(
sample_id
)
else:
if src_dict is not None:
src_str = src_dict.string(src_tokens, args.remove_bpe)
else:
src_str = ""
if not args.quiet:
if src_dict is not None:
print("S-{}\t{}".format(sample_id, src_str))
source_sentences.append(f"{sample_id}\t{src_str}")
num_sentences += sample["nsentences"]
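            # Flush embeddings and sentences to disk in shards of at most
            # 1M rows so memory stays bounded on large corpora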
if all_avg_pool.shape[0] >= 1000000:
with open(
f"{args.encoder_save_dir}/all_avg_pool.{args.source_lang}.{shard_id}",
"w",
) as avg_pool_file:
all_avg_pool.tofile(avg_pool_file)
with open(
f"{args.encoder_save_dir}/sentences.{args.source_lang}.{shard_id}",
"w",
) as sentence_file:
sentence_file.writelines(f"{line}\n" for line in source_sentences)
all_avg_pool = None
source_sentences = []
shard_id += 1
if all_avg_pool is not None:
with open(
f"{args.encoder_save_dir}/all_avg_pool.{args.source_lang}.{shard_id}", "w"
) as avg_pool_file:
all_avg_pool.tofile(avg_pool_file)
with open(
f"{args.encoder_save_dir}/sentences.{args.source_lang}.{shard_id}", "w"
) as sentence_file:
sentence_file.writelines(f"{line}\n" for line in source_sentences)
return None
def cli_main():
parser = options.get_generation_parser()
parser.add_argument(
"--encoder-save-dir",
default="",
type=str,
metavar="N",
help="directory to save encoder outputs",
)
args = options.parse_args_and_arch(parser)
main(args)
if __name__ == "__main__":
cli_main()
#!/usr/bin/env python3 -u
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import glob
import numpy as np
DIM = 1024
def compute_dist(source_embs, target_embs, k=5, return_sim_mat=False):
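    """For each source sentence, find the k nearest target sentences by
    cosine similarity (dot product of L2-normalized embedding matrices)."""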
target_ids = [tid for tid in target_embs]
source_mat = np.stack(source_embs.values(), axis=0)
normalized_source_mat = source_mat / np.linalg.norm(
source_mat, axis=1, keepdims=True
)
target_mat = np.stack(target_embs.values(), axis=0)
normalized_target_mat = target_mat / np.linalg.norm(
target_mat, axis=1, keepdims=True
)
sim_mat = normalized_source_mat.dot(normalized_target_mat.T)
if return_sim_mat:
return sim_mat
neighbors_map = {}
for i, sentence_id in enumerate(source_embs):
idx = np.argsort(sim_mat[i, :])[::-1][:k]
neighbors_map[sentence_id] = [target_ids[tid] for tid in idx]
return neighbors_map
def load_embeddings(directory, LANGS):
sentence_embeddings = {}
sentence_texts = {}
for lang in LANGS:
sentence_embeddings[lang] = {}
sentence_texts[lang] = {}
lang_dir = f"{directory}/{lang}"
embedding_files = glob.glob(f"{lang_dir}/all_avg_pool.{lang}.*")
for embed_file in embedding_files:
shard_id = embed_file.split(".")[-1]
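            # embeddings were saved with ndarray.tofile, so read the flat
            # float32 buffer back and reshape it to (num_sentences, DIM)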
embeddings = np.fromfile(embed_file, dtype=np.float32)
num_rows = embeddings.shape[0] // DIM
embeddings = embeddings.reshape((num_rows, DIM))
with open(f"{lang_dir}/sentences.{lang}.{shard_id}") as sentence_file:
for idx, line in enumerate(sentence_file):
sentence_id, sentence = line.strip().split("\t")
sentence_texts[lang][sentence_id] = sentence
sentence_embeddings[lang][sentence_id] = embeddings[idx, :]
return sentence_embeddings, sentence_texts
def compute_accuracy(directory, LANGS):
sentence_embeddings, sentence_texts = load_embeddings(directory, LANGS)
top_1_accuracy = {}
top1_str = " ".join(LANGS) + "\n"
for source_lang in LANGS:
top_1_accuracy[source_lang] = {}
top1_str += f"{source_lang} "
for target_lang in LANGS:
top1 = 0
top5 = 0
neighbors_map = compute_dist(
sentence_embeddings[source_lang], sentence_embeddings[target_lang]
)
for sentence_id, neighbors in neighbors_map.items():
if sentence_id == neighbors[0]:
top1 += 1
if sentence_id in neighbors[:5]:
top5 += 1
n = len(sentence_embeddings[target_lang])
top1_str += f"{top1/n} "
top1_str += "\n"
print(top1_str)
    with open(f"{directory}/accuracy", "w") as accuracy_file:
        print(top1_str, file=accuracy_file)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Analyze encoder outputs")
    parser.add_argument("directory", help="directory containing saved encoder outputs")
    parser.add_argument("--langs", required=True, help="comma-separated list of languages")
args = parser.parse_args()
langs = args.langs.split(",")
compute_accuracy(args.directory, langs)
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
source_lang=kk_KZ
target_lang=en_XX
MODEL=criss_checkpoints/criss.3rd.pt
SPM=criss_checkpoints/sentence.bpe.model
SPLIT=test
LANG_DICT=criss_checkpoints/lang_dict.txt
ENCODER_ANALYSIS=sentence_retrieval/encoder_analysis.py
SAVE_ENCODER=save_encoder.py
ENCODER_SAVE_ROOT=sentence_embeddings/$MODEL
DATA_DIR=data_tmp
INPUT_DIR=$DATA_DIR/${source_lang}-${target_lang}-tatoeba
ENCODER_SAVE_DIR=${ENCODER_SAVE_ROOT}/${source_lang}-${target_lang}
mkdir -p $ENCODER_SAVE_DIR/${target_lang}
mkdir -p $ENCODER_SAVE_DIR/${source_lang}
# Save encoder outputs for source sentences
python $SAVE_ENCODER \
${INPUT_DIR} \
--path ${MODEL} \
--task translation_multi_simple_epoch \
--lang-dict ${LANG_DICT} \
--gen-subset ${SPLIT} \
--bpe 'sentencepiece' \
--lang-pairs ${source_lang}-${target_lang} \
-s ${source_lang} -t ${target_lang} \
--sentencepiece-model ${SPM} \
--remove-bpe 'sentencepiece' \
--beam 1 \
--lang-tok-style mbart \
--encoder-save-dir ${ENCODER_SAVE_DIR}/${source_lang}
# Save encoder outputs for target sentences
python $SAVE_ENCODER \
${INPUT_DIR} \
--path ${MODEL} \
--lang-dict ${LANG_DICT} \
--task translation_multi_simple_epoch \
--gen-subset ${SPLIT} \
--bpe 'sentencepiece' \
--lang-pairs ${target_lang}-${source_lang} \
-t ${source_lang} -s ${target_lang} \
--sentencepiece-model ${SPM} \
--remove-bpe 'sentencepiece' \
--beam 1 \
--lang-tok-style mbart \
--encoder-save-dir ${ENCODER_SAVE_DIR}/${target_lang}
# Analyze sentence retrieval accuracy
python $ENCODER_ANALYSIS --langs "${source_lang},${target_lang}" ${ENCODER_SAVE_DIR}
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
SRC=si_LK
TGT=en_XX
MODEL=criss_checkpoints/criss.3rd.pt
MULTIBLEU=mosesdecoder/scripts/generic/multi-bleu.perl
MOSES=mosesdecoder
REPLACE_UNICODE_PUNCT=$MOSES/scripts/tokenizer/replace-unicode-punctuation.perl
NORM_PUNC=$MOSES/scripts/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$MOSES/scripts/tokenizer/remove-non-printing-char.perl
TOKENIZER=$MOSES/scripts/tokenizer/tokenizer.perl
GEN_TMP_DIR=gen_tmp
LANG_DICT=criss_checkpoints/lang_dict.txt
if [ ! -d "mosesdecoder" ]; then
git clone https://github.com/moses-smt/mosesdecoder
fi
mkdir -p $GEN_TMP_DIR
fairseq-generate data_tmp/${SRC}-${TGT}-flores \
--task translation_multi_simple_epoch \
--max-tokens 2000 \
--path ${MODEL} \
--skip-invalid-size-inputs-valid-test \
--beam 5 --lenpen 1.0 --gen-subset test \
--remove-bpe=sentencepiece \
--source-lang ${SRC} --target-lang ${TGT} \
--decoder-langtok --lang-pairs 'en_XX-ar_AR,en_XX-de_DE,en_XX-es_XX,en_XX-fr_XX,en_XX-hi_IN,en_XX-it_IT,en_XX-ja_XX,en_XX-ko_KR,en_XX-nl_XX,en_XX-ru_RU,en_XX-zh_CN,en_XX-tr_TR,en_XX-vi_VN,en_XX-ro_RO,en_XX-my_MM,en_XX-ne_NP,en_XX-si_LK,en_XX-cs_CZ,en_XX-lt_LT,en_XX-kk_KZ,en_XX-gu_IN,en_XX-fi_FI,en_XX-et_EE,en_XX-lv_LV,ar_AR-en_XX,cs_CZ-en_XX,de_DE-en_XX,es_XX-en_XX,et_EE-en_XX,fi_FI-en_XX,fr_XX-en_XX,gu_IN-en_XX,hi_IN-en_XX,it_IT-en_XX,ja_XX-en_XX,kk_KZ-en_XX,ko_KR-en_XX,lt_LT-en_XX,lv_LV-en_XX,my_MM-en_XX,ne_NP-en_XX,nl_XX-en_XX,ro_RO-en_XX,ru_RU-en_XX,si_LK-en_XX,tr_TR-en_XX,vi_VN-en_XX,zh_CN-en_XX,ar_AR-es_XX,es_XX-ar_AR,ar_AR-hi_IN,hi_IN-ar_AR,ar_AR-zh_CN,zh_CN-ar_AR,cs_CZ-es_XX,es_XX-cs_CZ,cs_CZ-hi_IN,hi_IN-cs_CZ,cs_CZ-zh_CN,zh_CN-cs_CZ,de_DE-es_XX,es_XX-de_DE,de_DE-hi_IN,hi_IN-de_DE,de_DE-zh_CN,zh_CN-de_DE,es_XX-hi_IN,hi_IN-es_XX,es_XX-zh_CN,zh_CN-es_XX,et_EE-es_XX,es_XX-et_EE,et_EE-hi_IN,hi_IN-et_EE,et_EE-zh_CN,zh_CN-et_EE,fi_FI-es_XX,es_XX-fi_FI,fi_FI-hi_IN,hi_IN-fi_FI,fi_FI-zh_CN,zh_CN-fi_FI,fr_XX-es_XX,es_XX-fr_XX,fr_XX-hi_IN,hi_IN-fr_XX,fr_XX-zh_CN,zh_CN-fr_XX,gu_IN-es_XX,es_XX-gu_IN,gu_IN-hi_IN,hi_IN-gu_IN,gu_IN-zh_CN,zh_CN-gu_IN,hi_IN-zh_CN,zh_CN-hi_IN,it_IT-es_XX,es_XX-it_IT,it_IT-hi_IN,hi_IN-it_IT,it_IT-zh_CN,zh_CN-it_IT,ja_XX-es_XX,es_XX-ja_XX,ja_XX-hi_IN,hi_IN-ja_XX,ja_XX-zh_CN,zh_CN-ja_XX,kk_KZ-es_XX,es_XX-kk_KZ,kk_KZ-hi_IN,hi_IN-kk_KZ,kk_KZ-zh_CN,zh_CN-kk_KZ,ko_KR-es_XX,es_XX-ko_KR,ko_KR-hi_IN,hi_IN-ko_KR,ko_KR-zh_CN,zh_CN-ko_KR,lt_LT-es_XX,es_XX-lt_LT,lt_LT-hi_IN,hi_IN-lt_LT,lt_LT-zh_CN,zh_CN-lt_LT,lv_LV-es_XX,es_XX-lv_LV,lv_LV-hi_IN,hi_IN-lv_LV,lv_LV-zh_CN,zh_CN-lv_LV,my_MM-es_XX,es_XX-my_MM,my_MM-hi_IN,hi_IN-my_MM,my_MM-zh_CN,zh_CN-my_MM,ne_NP-es_XX,es_XX-ne_NP,ne_NP-hi_IN,hi_IN-ne_NP,ne_NP-zh_CN,zh_CN-ne_NP,nl_XX-es_XX,es_XX-nl_XX,nl_XX-hi_IN,hi_IN-nl_XX,nl_XX-zh_CN,zh_CN-nl_XX,ro_RO-es_XX,es_XX-ro_RO,ro_RO-hi_IN,hi_IN-ro_RO,ro_RO-zh_CN,zh_CN-ro_RO,ru_RU-es_XX,es_XX-ru_RU,ru_RU-hi_IN,hi_IN-ru_RU,ru_RU-zh_CN,zh_CN-ru_RU,si_LK-es_XX,es_XX-si_LK,si_LK-hi_IN,hi_IN-si_LK,si_LK-zh_CN,zh_CN-si_LK,tr_TR-es_XX,es_XX-tr_TR,tr_TR-hi_IN,hi_IN-tr_TR,tr_TR-zh_CN,zh_CN-tr_TR,vi_VN-es_XX,es_XX-vi_VN,vi_VN-hi_IN,hi_IN-vi_VN,vi_VN-zh_CN,zh_CN-vi_VN' \
--lang-dict ${LANG_DICT} --lang-tok-style 'mbart' --sampling-method 'temperature' --sampling-temperature '1.0' > $GEN_TMP_DIR/${SRC}_${TGT}.gen
cat $GEN_TMP_DIR/${SRC}_${TGT}.gen | grep -P "^H-" | cut -f3 | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l ${TGT:0:2} | $REM_NON_PRINT_CHAR | $TOKENIZER -no-escape -l ${TGT:0:2} > $GEN_TMP_DIR/${SRC}_${TGT}.hyp
cat $GEN_TMP_DIR/${SRC}_${TGT}.gen | grep -P "^T-" | cut -f2 | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l ${TGT:0:2} | $REM_NON_PRINT_CHAR | $TOKENIZER -no-escape -l ${TGT:0:2} > $GEN_TMP_DIR/${SRC}_${TGT}.ref
${MULTIBLEU} $GEN_TMP_DIR/${SRC}_${TGT}.ref < $GEN_TMP_DIR/${SRC}_${TGT}.hyp
# Cross-Lingual Language Model Pre-training
Below are details for training Cross-Lingual Language Models (XLM) in fairseq, similar to the ones presented in [Lample & Conneau, 2019](https://arxiv.org/pdf/1901.07291.pdf). The current implementation only supports the Masked Language Model (MLM) objective from the paper.
## Downloading and Tokenizing Monolingual Data
Pointers to the monolingual Wikipedia data used for training the XLM-style MLM model, as well as details on processing it (tokenization and BPE), can be found in the [XLM GitHub Repository](https://github.com/facebookresearch/XLM#download--preprocess-monolingual-data).
For the code snippets in later sections to work, assume the following:
- Processed data is in the folder: monolingual_data/processed
- Each language has 3 files, for train, validation and test. For example, English has:
  train.en, valid.en, test.en
- We are training a model for 5 languages: Arabic (ar), German (de), English (en), Hindi (hi) and French (fr)
- The vocabulary file is monolingual_data/processed/vocab_mlm
## Fairseq Pre-processing and Binarization
Pre-process and binarize the data with the `MaskedLMDictionary` and the `cross_lingual_lm` task:
```bash
# Ensure the output directory exists
DATA_DIR=monolingual_data/fairseq_processed
mkdir -p "$DATA_DIR"
for lg in ar de en hi fr
do
fairseq-preprocess \
--task cross_lingual_lm \
--srcdict monolingual_data/processed/vocab_mlm \
--only-source \
--trainpref monolingual_data/processed/train \
--validpref monolingual_data/processed/valid \
--testpref monolingual_data/processed/test \
    --destdir "$DATA_DIR" \
--workers 20 \
--source-lang $lg
# Since we only have a source language, the output file has a None for the
# target language. Remove this
  for stage in train test valid; do
    mv "$DATA_DIR/$stage.$lg-None.$lg.bin" "$DATA_DIR/$stage.$lg.bin"
    mv "$DATA_DIR/$stage.$lg-None.$lg.idx" "$DATA_DIR/$stage.$lg.idx"
done
done
```
## Train a Cross-lingual Language Model similar to the XLM MLM model
Use the following command to train the model on 5 languages.
```bash
fairseq-train \
--task cross_lingual_lm monolingual_data/fairseq_processed \
--save-dir checkpoints/mlm \
--max-update 2400000 --save-interval 1 --no-epoch-checkpoints \
--arch xlm_base \
--optimizer adam --lr-scheduler reduce_lr_on_plateau \
--lr-shrink 0.5 --lr 0.0001 --min-lr 1e-09 \
--dropout 0.1 \
--criterion legacy_masked_lm_loss \
--max-tokens 2048 --tokens-per-sample 256 --attention-dropout 0.1 \
--dataset-impl lazy --seed 0 \
--masked-lm-only \
--monolingual-langs 'ar,de,en,hi,fr' --num-segment 5 \
--ddp-backend=no_c10d
```
Some notes:
- Using `tokens-per-sample` greater than 256 can cause OOM (out-of-memory) issues. Since MLM packs streams of text together, this parameter usually doesn't need much tuning.
- The evaluation workflow for computing MLM perplexity on test data is in progress.
- Fine-tuning this model on a downstream task is not currently supported.
# Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)
This page includes instructions for training models described in [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](https://arxiv.org/abs/1909.02074).
## Training a joint alignment-translation model on WMT'18 En-De
##### 1. Extract and preprocess the WMT'18 En-De data
```bash
./prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh
```
##### 2. Generate alignments with a statistical alignment toolkit, e.g. Giza++ or FastAlign.
In this example, we use FastAlign.
```bash
git clone git@github.com:clab/fast_align.git
pushd fast_align
mkdir build
cd build
cmake ..
make
popd
ALIGN=fast_align/build/fast_align
paste bpe.32k/train.en bpe.32k/train.de | awk -F '\t' '{print $1 " ||| " $2}' > bpe.32k/train.en-de
$ALIGN -i bpe.32k/train.en-de -d -o -v > bpe.32k/train.align
```
##### 3. Preprocess the dataset with the above generated alignments.
```bash
fairseq-preprocess \
--source-lang en --target-lang de \
--trainpref bpe.32k/train \
--validpref bpe.32k/valid \
--testpref bpe.32k/test \
--align-suffix align \
--destdir binarized/ \
--joined-dictionary \
--workers 32
```
##### 4. Train a model
```bash
fairseq-train \
binarized \
--arch transformer_wmt_en_de_big_align --share-all-embeddings \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --activation-fn relu \
--lr 0.0002 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
--dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
--max-tokens 3500 --label-smoothing 0.1 \
--save-dir ./checkpoints --log-interval 1000 --max-update 60000 \
--keep-interval-updates -1 --save-interval-updates 0 \
--load-alignments --criterion label_smoothed_cross_entropy_with_alignment \
--fp16
```
Note that the `--fp16` flag requires you to have CUDA 9.1 or greater and a Volta GPU or newer.
If you want to train the above model with big batches (assuming your machine has 8 GPUs):
- add `--update-freq 8` to simulate training on 8x8=64 GPUs
- increase the learning rate; 0.0007 works well for big batches
##### 5. Evaluate and generate the alignments (BPE level)
```bash
fairseq-generate \
binarized --gen-subset test --print-alignment \
--source-lang en --target-lang de \
--path checkpoints/checkpoint_best.pt --beam 5 --nbest 1
```
##### 6. Other resources.
The code for:
1. preparing alignment test sets
2. converting BPE level alignments to token level alignments
3. symmetrizing bidirectional alignments
4. evaluating alignments using AER metric
can be found [here](https://github.com/lilt/alignment-scripts).
## Citation
```bibtex
@inproceedings{garg2019jointly,
title = {Jointly Learning to Align and Translate with Transformer Models},
author = {Garg, Sarthak and Peitz, Stephan and Nallasamy, Udhyakumar and Paulik, Matthias},
booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)},
address = {Hong Kong},
month = {November},
url = {https://arxiv.org/abs/1909.02074},
year = {2019},
}
```
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git
SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
URLS=(
"http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
"http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
"http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz"
"http://data.statmt.org/wmt18/translation-task/rapid2016.tgz"
"http://data.statmt.org/wmt17/translation-task/dev.tgz"
"http://statmt.org/wmt14/test-full.tgz"
)
CORPORA=(
"training/europarl-v7.de-en"
"commoncrawl.de-en"
"training-parallel-nc-v13/news-commentary-v13.de-en"
"rapid2016.de-en"
)
if [ ! -d "$SCRIPTS" ]; then
echo "Please set SCRIPTS variable correctly to point to Moses scripts."
    exit 1
fi
src=en
tgt=de
lang=en-de
prep=wmt18_en_de
tmp=$prep/tmp
orig=orig
dev=dev/newstest2012
codes=32000
bpe=bpe.32k
mkdir -p $orig $tmp $prep $bpe
cd $orig
for ((i=0;i<${#URLS[@]};++i)); do
url=${URLS[i]}
file=$(basename $url)
if [ -f $file ]; then
echo "$file already exists, skipping download"
else
wget "$url"
if [ -f $file ]; then
echo "$url successfully downloaded."
else
echo "$url not successfully downloaded."
exit 1
fi
if [ ${file: -4} == ".tgz" ]; then
tar zxvf $file
elif [ ${file: -4} == ".tar" ]; then
tar xvf $file
fi
fi
done
cd ..
echo "pre-processing train data..."
for l in $src $tgt; do
rm -rf $tmp/train.tags.$lang.tok.$l
for f in "${CORPORA[@]}"; do
cat $orig/$f.$l | \
perl $REM_NON_PRINT_CHAR | \
perl $TOKENIZER -threads 8 -l $l -no-escape >> $tmp/train.tags.$lang.tok.$l
done
done
echo "pre-processing test data..."
for l in $src $tgt; do
if [ "$l" == "$src" ]; then
t="src"
else
t="ref"
fi
grep '<seg id' $orig/test-full/newstest2014-deen-$t.$l.sgm | \
sed -e 's/<seg id="[0-9]*">\s*//g' | \
sed -e 's/\s*<\/seg>\s*//g' | \
sed -e "s/\’/\'/g" | \
perl $TOKENIZER -threads 8 -l $l -no-escape > $tmp/test.$l
echo ""
done
# apply length filtering before BPE
perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train 1 100
# use newstest2012 for valid
echo "pre-processing valid data..."
for l in $src $tgt; do
rm -rf $tmp/valid.$l
cat $orig/$dev.$l | \
perl $REM_NON_PRINT_CHAR | \
perl $TOKENIZER -threads 8 -l $l -no-escape >> $tmp/valid.$l
done
mkdir -p output
mv $tmp/{train,valid,test}.{$src,$tgt} output
#BPE
git clone https://github.com/glample/fastBPE.git
pushd fastBPE
g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
popd
fastBPE/fast learnbpe $codes output/train.$src output/train.$tgt > $bpe/codes
for split in train valid test; do
    for lang in en de; do
        fastBPE/fast applybpe $bpe/$split.$lang output/$split.$lang $bpe/codes
    done
done
# Adaptive Input Representations for Neural Language Modeling (Baevski and Auli, 2018)
## Pre-trained models
Description | Parameters | Dataset | Model and Test set(s)
---|---:|---|---
Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 1026M | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2)
Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 247M | [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2)
## Training an LM with adaptive inputs
First, see the general [language modeling README](README.md) for instructions on
preprocessing the WikiText-103 data.
Then use the following training command to train a model with adaptive inputs
using the `transformer_lm_wiki103` model architecture:
```bash
fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/transformer_wikitext-103 \
--arch transformer_lm_wiki103 \
--max-update 286000 --max-lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \
--warmup-updates 16000 --warmup-init-lr 1e-07 --min-lr 1e-09 --optimizer nag --lr 0.0001 --clip-norm 0.1 \
--criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \
--sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=no_c10d
```
## Citation
```bibtex
@inproceedings{
baevski2018adaptive,
title={Adaptive Input Representations for Neural Language Modeling},
author={Alexei Baevski and Michael Auli},
booktitle={International Conference on Learning Representations},
year={2019},
url={https://openreview.net/forum?id=ByxZX20qFQ},
}
```
# Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)
## Example usage
First download and preprocess the data following the main [language modeling README](README.md).
Then to train a convolutional LM using the `fconv_lm_dauphin_wikitext103`
architecture:
```bash
fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/fconv_wikitext-103 \
--arch fconv_lm_dauphin_wikitext103 \
--adaptive-softmax-cutoff 10000,20000,200000 \
--dropout 0.2 \
--criterion adaptive_loss \
--optimizer nag --clip-norm 0.1 --weight-decay 5e-06 \
--lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \
--max-tokens 1024 --tokens-per-sample 1024 \
--ddp-backend no_c10d \
--max-epoch 35
```
And evaluate with:
```bash
fairseq-eval-lm data-bin/wikitext-103 --path checkpoints/fconv_wiki103/checkpoint_best.pt
```
## Citation
```bibtex
@inproceedings{dauphin2017language,
title={Language Modeling with Gated Convolutional Networks},
author={Dauphin, Yann N and Fan, Angela and Auli, Michael and Grangier, David},
booktitle={Proceedings of the 34th International Conference on Machine Learning-Volume 70},
pages={933--941},
year={2017},
organization={JMLR}
}
```
# Neural Language Modeling
## Pre-trained models
Model | Description | Dataset | Download
---|---|---|---
`transformer_lm.gbw.adaptive_huge` | Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) <br> 1026M params | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2)
`transformer_lm.wiki103.adaptive` | Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) <br> 247M params | [WikiText-103](https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2)
`transformer_lm.wmt19.en` | English LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz)
`transformer_lm.wmt19.de` | German LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz)
`transformer_lm.wmt19.ru` | Russian LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz)
## Example usage
We require a few additional Python dependencies for preprocessing:
```bash
pip install fastBPE sacremoses
```
To sample from a language model using PyTorch Hub:
```python
import torch
# List available models
torch.hub.list('pytorch/fairseq') # [..., 'transformer_lm.wmt19.en', ...]
# Load an English LM trained on WMT'19 News Crawl data
en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer='moses', bpe='fastbpe')
en_lm.eval() # disable dropout
# Move model to GPU
en_lm.cuda()
# Sample from the language model
en_lm.sample('Barack Obama', beam=1, sampling=True, sampling_topk=10, temperature=0.8)
# "Barack Obama is coming to Sydney and New Zealand (...)"
# Compute perplexity for a sequence
en_lm.score('Barack Obama is coming to Sydney and New Zealand')['positional_scores'].mean().neg().exp()
# tensor(15.1474)
# The same interface can be used with custom models as well
from fairseq.models.transformer_lm import TransformerLanguageModel
custom_lm = TransformerLanguageModel.from_pretrained('/path/to/model/dir', 'checkpoint100.pt', tokenizer='moses', bpe='fastbpe')
custom_lm.sample('Barack Obama', beam=5)
# "Barack Obama (...)"
```
## Training a transformer language model with the CLI tools
### 1) Preprocess the data
First download and prepare the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/):
```bash
cd examples/language_model/
bash prepare-wikitext-103.sh
cd ../..
```
Next preprocess/binarize the data:
```bash
TEXT=examples/language_model/wikitext-103
fairseq-preprocess \
--only-source \
--trainpref $TEXT/wiki.train.tokens \
--validpref $TEXT/wiki.valid.tokens \
--testpref $TEXT/wiki.test.tokens \
--destdir data-bin/wikitext-103 \
--workers 20
```
### 2) Train a language model
Next we'll train a basic transformer language model on wikitext-103. For more
advanced usage, see the [adaptive inputs README](README.adaptive_inputs.md).
To train a basic LM (assumes 2 GPUs):
```bash
fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/transformer_wikitext-103 \
--arch transformer_lm --share-decoder-input-output-embed \
--dropout 0.1 \
--optimizer adam --adam-betas '(0.9, 0.98)' --weight-decay 0.01 --clip-norm 0.0 \
--lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
--tokens-per-sample 512 --sample-break-mode none \
--max-tokens 2048 --update-freq 16 \
--fp16 \
--max-update 50000
```
If you run out of memory, try reducing `--max-tokens` (max number of tokens per
batch) or `--tokens-per-sample` (max sequence length). You can also adjust
`--update-freq` to accumulate gradients and simulate training on a different
number of GPUs.
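For intuition, `--update-freq N` accumulates gradients over N batches before each optimizer step, which matches an N-times larger batch in expectation. Below is a minimal PyTorch sketch of the mechanism; the model, optimizer, and batch iterator are placeholders, not fairseq internals:

```python
def train_with_accumulation(model, optimizer, batches, update_freq=16):
    """Take one optimizer step per `update_freq` batches (a sketch)."""
    optimizer.zero_grad()
    for i, (inputs, targets) in enumerate(batches, start=1):
        loss = model(inputs, targets)    # assume a scalar training loss
        (loss / update_freq).backward()  # scale so gradients average
        if i % update_freq == 0:
            optimizer.step()
            optimizer.zero_grad()
```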
### 3) Evaluate
```bash
fairseq-eval-lm data-bin/wikitext-103 \
--path checkpoints/transformer_wiki103/checkpoint_best.pt \
--batch-size 2 \
--tokens-per-sample 512 \
--context-window 400
# | Evaluated 245569 tokens in 56.1s (4379.02 tokens/s)
# | Loss: 3.4164, Perplexity: 30.46
```
*Note:* The `--context-window` option controls how much context is provided to
each token when computing perplexity. When the window size is 0, the dataset is
chunked into segments of length 512 and perplexity is computed over each segment
normally. However, this results in worse (higher) perplexity since tokens that
appear earlier in each segment have less conditioning. When the maximum window
size is used (511 in this case), then we compute perplexity for each token
fully conditioned on 511 tokens of context. This slows down evaluation
significantly, since we must run a separate forward pass for every token in the
dataset, but results in better (lower) perplexity.
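For intuition, here is a schematic of the sliding-window evaluation (not fairseq's actual implementation); `score_tokens` is a hypothetical stand-in for a function that returns one log-probability per input token:

```python
import math

def windowed_perplexity(tokens, score_tokens, seg_len=512, context_window=400):
    """Score `tokens` in strides of (seg_len - context_window), prepending
    `context_window` tokens of already-seen context to each segment."""
    total_logprob, count = 0.0, 0
    stride = seg_len - context_window
    for start in range(0, len(tokens), stride):
        ctx_start = max(0, start - context_window)
        logprobs = score_tokens(tokens[ctx_start:start + stride])
        new = logprobs[start - ctx_start:]  # skip the context positions
        total_logprob += sum(new)
        count += len(new)
    return math.exp(-total_logprob / count)
```

With `context_window=0` this reduces to plain segment-by-segment scoring; with the maximum window, every token after the first segment is conditioned on a full 511 tokens of context.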
## Convolutional language models
Please see the [convolutional LM README](README.conv.md) for instructions on
training convolutional language models.
#!/bin/bash
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
URLS=(
"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"
)
FILES=(
"wikitext-103-v1.zip"
)
for ((i=0;i<${#URLS[@]};++i)); do
file=${FILES[i]}
if [ -f $file ]; then
echo "$file already exists, skipping download"
else
url=${URLS[i]}
wget "$url"
if [ -f $file ]; then
echo "$url successfully downloaded."
else
echo "$url not successfully downloaded."
        exit 1
fi
if [ ${file: -4} == ".tgz" ]; then
tar zxvf $file
elif [ ${file: -4} == ".tar" ]; then
tar xvf $file
elif [ ${file: -4} == ".zip" ]; then
unzip $file
fi
fi
done
cd ..
# Deep Transformers with Latent Depth (Li et al., 2020)
[https://arxiv.org/abs/2009.13102](https://arxiv.org/abs/2009.13102).
## Introduction
We present a probabilistic framework that automatically learns which layer(s) to use by estimating posterior distributions over layer selection. As an extension of this framework, we propose a novel method to train one shared Transformer network for multilingual machine translation, with a different layer-selection posterior for each language pair.
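As a rough sketch of the idea (a toy illustration, not the paper's exact parameterization): each language pair keeps one selection logit per decoder layer, a relaxed Bernoulli ("concrete") gate is sampled from each logit during training, and each layer's residual output is scaled by its gate, so the sparsity losses in `latent_depth.py` can push rarely used layers toward zero:

```python
import torch
import torch.nn as nn

class GatedLayerStack(nn.Module):
    """Toy latent-depth stack: per-(language, layer) soft selection gates."""

    def __init__(self, layers, num_langs):
        super().__init__()
        self.layers = nn.ModuleList(layers)
        # one selection logit per (language, layer) pair
        self.logits = nn.Parameter(torch.zeros(num_langs, len(layers)))

    def forward(self, x, lang_idx, tau=1.0):
        # relaxed Bernoulli gates: sigmoid((logit + logistic noise) / tau)
        u = torch.rand_like(self.logits[lang_idx])
        noise = torch.log(u) - torch.log1p(-u)
        gates = torch.sigmoid((self.logits[lang_idx] + noise) / tau)
        for gate, layer in zip(gates, self.layers):
            x = x + gate * layer(x)  # residual branch scaled by its gate
        return x
```

For example, `GatedLayerStack([nn.Linear(512, 512) for _ in range(24)], num_langs=8)` gives a 24-layer stack whose effective depth can differ across 8 language pairs.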
## Training a multilingual model with latent depth
Below is an example of training with latent depth in the decoder for one-to-many (O2M) related languages. We use the same preprocessed (numberized and binarized) TED8 dataset as in [Balancing Training for Multilingual Neural Machine Translation (Wang et al., 2020)](https://github.com/cindyxinyiwang/multiDDS), which can be generated with [the script](https://github.com/cindyxinyiwang/multiDDS/blob/multiDDS/util_scripts/prepare_multilingual_data.sh) provided by the authors.
```bash
lang_pairs_str="eng-aze,eng-bel,eng-ces,eng-glg,eng-por,eng-rus,eng-slk,eng-tur"
databin_dir=<path to binarized data>
fairseq-train ${databin_dir} \
--user-dir examples/latent_depth/latent_depth_src \
--lang-pairs "${lang_pairs_str}" \
--arch multilingual_transformer_iwslt_de_en \
--task multilingual_translation_latent_depth \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--share-encoders \
--share-decoders \
--decoder-langtok \
--share-decoder-input-output-embed \
--dropout 0.3 --attention-dropout 0.3 \
--optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
--lr-scheduler inverse_sqrt --min-lr 1e-9 --warmup-init-lr 1e-7 --warmup-updates 8000 \
--max-tokens 4096 --update-freq 1 \
--lr 0.0015 \
--clip-norm 1.0 \
--seed 2 \
--ddp-backend=no_c10d \
--encoder-layers 12 \
--decoder-layers 24 \
--decoder-latent-layer \
--sparsity-weight 0.1 \
--anneal-updates 5000 \
--soft-update 500 \
--target-layers 12 \
--share-weight 0.1
```
## Inference command
```bash
lang_pairs_str="eng-aze,eng-bel,eng-ces,eng-glg,eng-por,eng-rus,eng-slk,eng-tur"
databin_dir=<path to binarized data>
model_path=<path to checkpoint>
src_lang=<source language to translate from>
tgt_lang=<target language to translate to>
gen_data=<name of data split, e.g. valid, test, etc>
fairseq-generate ${databin_dir} \
--path ${model_path} \
--task multilingual_translation_latent_depth \
--decoder-latent-layer \
--lang-pairs "${lang_pairs_str}" \
-s ${src_lang} -t ${tgt_lang} \
--gen-subset $gen_data \
--scoring sacrebleu \
--remove-bpe 'sentencepiece' \
--lenpen 1.0 \
--beam 5 \
--decoder-langtok \
--max-tokens 4096
```
## Citation
```bibtex
@article{li2020deep,
title={Deep Transformers with Latent Depth},
author={Li, Xian and Stickland, Asa Cooper and Tang, Yuqing and Kong, Xiang},
journal={arXiv preprint arXiv:2009.13102},
year={2020}
}
```
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from . import multilingual_translation_latent_depth # noqa
from .loss import latent_depth # noqa
from .models import latent_multilingual_transformer # noqa
from .modules import latent_layers # noqa
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import math
import torch
from torch.nn.modules.loss import _Loss
class LatentLayersKLLoss(_Loss):
def __init__(self, args):
super().__init__()
self.args = args
def forward(self, layer_samples, lang_idx, update_num, sample_size):
prior = self.args.prior
samples = layer_samples[lang_idx]
eps = 1e-7
if prior == "uniform":
# uniform prior
kl_loss = (samples * (torch.log(samples + eps) - math.log(0.5))).sum(-1)
elif prior == "agged_posterior":
# aggregated posterior
y_t = torch.stack([x.detach() for x in layer_samples], dim=0)
agged_q = torch.sum(y_t, dim=0)
row_norm = agged_q.sum(-1)
normed_agg_q = agged_q / row_norm
kl_loss = (
samples * (torch.log(samples + eps) - torch.log(normed_agg_q + eps))
).sum(-1)
else:
raise NotImplementedError("The specified prior is not implemented.")
# normalized by number of layers
kl_loss /= layer_samples[0].size()[0]
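        # linearly anneal the KL weight from 0 up to sparsity_weight over
        # anneal_updates steps, starting after soft_update warmup steps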
kl_weight = min(
self.args.sparsity_weight,
(update_num - self.args.soft_update)
* self.args.sparsity_weight
/ self.args.anneal_updates,
)
kl_loss *= kl_weight * sample_size
return kl_loss
class LatentLayersSparsityLoss(_Loss):
def __init__(self, args):
super().__init__()
self.args = args
def is_valid(self, update_num):
if self.args.target_layers <= 0:
return False
return update_num > (self.args.soft_update + self.args.anneal_updates)
def forward(self, layer_samples_list, update_num, sample_size):
batch_loss = 0
share_loss = 0
global_sparsity_loss = 0
layer_samples = torch.stack(layer_samples_list, dim=0)
if (
self.args.target_layers > 0 or self.args.share_weight > 0
) and update_num > (self.args.soft_update + self.args.anneal_updates):
# anneal sparsity weight
if update_num < (self.args.anneal_updates + self.args.soft_update):
weight_anneal = 0
elif update_num < (2 * self.args.anneal_updates + self.args.soft_update):
weight_anneal = (
(update_num - self.args.soft_update - self.args.anneal_updates)
* self.args.share_weight
/ self.args.anneal_updates
)
else:
weight_anneal = 1
# compute ratio among languages
layer_utilization = torch.sum(layer_samples, dim=0)
layer_utilization /= layer_samples.size()[0]
if self.args.share_weight > 0:
# encouraging sharing across languages
share_loss = sum(
-1.0 * v * math.log(v) for v in layer_utilization if v > 0
)
batch_loss += (
weight_anneal * self.args.share_weight * sample_size * share_loss
)
if self.args.target_layers > 0:
                # compute the expected number of selected layers
                expected_layers = sum(layer_utilization)
                # L2 penalty w.r.t. the target number of layers
                global_sparsity_loss = (expected_layers - self.args.target_layers) ** 2
batch_loss += (
weight_anneal
* self.args.share_weight
* sample_size
* global_sparsity_loss
)
return batch_loss