#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
CWD=`pwd`
INSTALL_PATH=$CWD/tokenizers/thirdparty
MOSES=$INSTALL_PATH/mosesdecoder
if [ ! -d $MOSES ]; then
  echo 'Cloning Moses github repository (for tokenization scripts)...'
  git clone https://github.com/moses-smt/mosesdecoder.git $MOSES
  cd $MOSES
  # To deal with differences in handling ' vs "
  git checkout 03578921cc1a03402
  cd -
fi
WMT16_SCRIPTS=$INSTALL_PATH/wmt16-scripts
if [ ! -d $WMT16_SCRIPTS ]; then
  echo 'Cloning Romanian tokenization scripts'
  git clone https://github.com/rsennrich/wmt16-scripts.git $WMT16_SCRIPTS
fi
KYTEA=$INSTALL_PATH/kytea
if [ ! -f $KYTEA/bin/kytea ]; then
  git clone https://github.com/neubig/kytea.git $KYTEA
  cd $KYTEA
  autoreconf -i
  ./configure --prefix=`pwd`
  make
  make install
  cd ..
fi
export MECAB=$INSTALL_PATH/mecab-0.996-ko-0.9.2
if [ ! -f $MECAB/bin/mecab ]; then
  cd $INSTALL_PATH
  curl -LO https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
  tar zxfv mecab-0.996-ko-0.9.2.tar.gz
  cd mecab-0.996-ko-0.9.2/
  ./configure --prefix=`pwd`
  make
  make install
  cd ..
  curl -LO https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz
  tar zxfv mecab-ko-dic-2.1.1-20180720.tar.gz
  cd mecab-ko-dic-2.1.1-20180720/
  ./autogen.sh
  ./configure --prefix=`pwd` --with-dicdir=$MECAB/lib/mecab/dic/mecab-ko-dic --with-mecab-config=$MECAB/bin/mecab-config
  make
  sh -c 'echo "dicdir=$MECAB/lib/mecab/dic/mecab-ko-dic" > $MECAB/etc/mecabrc'
  make install
  cd $CWD
fi
INDIC_RESOURCES_PATH=$INSTALL_PATH/indic_nlp_resources
if [ ! -d $INDIC_RESOURCES_PATH ]; then
  echo 'Cloning indic_nlp_resources'
  git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git $INDIC_RESOURCES_PATH
fi
if [ ! -f $INSTALL_PATH/seg_my.py ]; then
  cd $INSTALL_PATH
  wget http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2020.my-en.zip
  unzip wat2020.my-en.zip
  # switch to python3
  cat wat2020.my-en/myseg.py | sed 's/^sys.std/###sys.std/g' | sed 's/### sys/sys/g' | sed 's/unichr/chr/g' > seg_my.py
  cd $CWD
fi
pip install pythainlp sacrebleu indic-nlp-library
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--src', type=str, help='Source language')
parser.add_argument('--tgt', type=str, help='Target language')
parser.add_argument('--src-file', type=str, help='Input source file')
parser.add_argument('--tgt-file', type=str, help='Input target file')
parser.add_argument('--src-output-file', type=str, help='Output source file')
parser.add_argument('--tgt-output-file', type=str, help='Output target file')
parser.add_argument('--threshold', type=float, default=0.5, help='Threshold')
parser.add_argument('--threshold-character', type=str, default=']', help='Threshold character')
parser.add_argument('--histograms', type=str, help='Path to histograms')
args = parser.parse_args()
def read_hist(f):
    # Collect the accepted characters for a language: take the first character
    # of each histogram line until the threshold character is reached.
    ch = []
    for line in f:
        c = line[0]
        if c == args.threshold_character:
            break
        ch.append(c)
    return ch
with open("{}/{}".format(args.histograms, args.src), 'r', encoding='utf8') as f:
    ch1 = read_hist(f)
with open("{}/{}".format(args.histograms, args.tgt), 'r', encoding='utf8') as f:
    ch2 = read_hist(f)
print("Accepted characters for {}: {}".format(args.src, ch1))
print("Accepted characters for {}: {}".format(args.tgt, ch2))
with open(args.src_file, 'r', encoding='utf8') as fs1, open(args.tgt_file, 'r', encoding='utf8') as fs2, open(args.src_output_file, 'w', encoding='utf8') as fos1, open(args.tgt_output_file, 'w', encoding='utf8') as fos2:
    ls1 = fs1.readline()
    ls2 = fs2.readline()
    while ls1 or ls2:
        cnt1 = len([c for c in ls1.strip() if c in ch1])
        cnt2 = len([c for c in ls2.strip() if c in ch2])
        # Keep the pair only if both sides are mostly made of accepted characters.
        if cnt1 / len(ls1) > args.threshold and cnt2 / len(ls2) > args.threshold:
            fos1.write(ls1)
            fos2.write(ls2)
        else:
            print("{} {} {} \n{} {} {}".format(args.src, cnt1 / len(ls1), ls1.strip(), args.tgt, cnt2 / len(ls2), ls2.strip()))
        ls1 = fs1.readline()
        ls2 = fs2.readline()
import argparse
from collections import namedtuple
import os
DATADIR = "/path/to/train_data"
DEDUP_FROM_DIR = "/path/to/eval/data"
OUTPUT_DIR = "/path/to/output/data"
def main(args):
    languages = set()
    for language_directory in os.listdir(DATADIR):
        if "_" in language_directory:
            src, tgt = language_directory.split("_")
            languages.add(LanguagePair(src=src, tgt=tgt))
    data = existing_data()
    train_languages = sorted(languages)
    for language_pair in train_languages[args.start_index:args.start_index + args.size]:
        print(language_pair)
        dedup(language_pair, data)
LanguagePair = namedtuple("LanguagePair", ["src", "tgt"])
def existing_data():
    data = set()
    for file in os.listdir(DEDUP_FROM_DIR):
        with open(os.path.join(DEDUP_FROM_DIR, file)) as f:
            data |= set(f.readlines())
    return data
def dedup(language_pair, data, verbose=True, output=True):
    train_filenames = LanguagePair(
        src=f"{DATADIR}/{language_pair.src}_{language_pair.tgt}/train.{language_pair.src}",
        tgt=f"{DATADIR}/{language_pair.src}_{language_pair.tgt}/train.{language_pair.tgt}",
    )
    output_filenames = LanguagePair(
        src=f"{OUTPUT_DIR}/train.dedup.{language_pair.src}-{language_pair.tgt}.{language_pair.src}",
        tgt=f"{OUTPUT_DIR}/train.dedup.{language_pair.src}-{language_pair.tgt}.{language_pair.tgt}",
    )
    # If the output exists, this pair has already been processed; skip it.
    if (os.path.exists(output_filenames.src) and
            os.path.exists(output_filenames.tgt)):
        if verbose:
            print(f"{language_pair.src}-{language_pair.tgt} already done.")
        return
    if verbose:
        print(f"{language_pair.src}-{language_pair.tgt} ready, will check dups.")
    # If no output is requested, there is no need to run the loop.
    if not output:
        return
    if os.path.exists(train_filenames.src) and os.path.exists(train_filenames.tgt):
        with open(train_filenames.src) as f:
            train_source = f.readlines()
        with open(train_filenames.tgt) as f:
            train_target = f.readlines()
        assert len(train_source) == len(train_target)
        # Drop any pair whose source or target line occurs in the eval data.
        new_train_source = []
        new_train_target = []
        for i, train_line in enumerate(train_source):
            if train_line not in data and train_target[i] not in data:
                new_train_source.append(train_line)
                new_train_target.append(train_target[i])
        assert len(new_train_source) == len(new_train_target)
        assert len(new_train_source) <= len(train_source)
        with open(output_filenames.src, "w") as o:
            for line in new_train_source:
                o.write(line)
        with open(output_filenames.tgt, "w") as o:
            for line in new_train_target:
                o.write(line)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--start-index", required=True, type=int)
    parser.add_argument("-n", "--size", required=True, type=int)
    main(parser.parse_args())
import gzip
import argparse
from string import punctuation
def len_no_punc(s, punc):
    # Count the punctuation characters in s.
    return len([ch for ch in s if ch in punc])

def filter_overpunc(len_npunc, len_sen):
    # Keep a sentence only if less than half of its characters are punctuation.
    return len_npunc < 0.5 * len_sen
def main(args):
    punc = punctuation + "—|–"
    print('Processing file {}'.format(args.input))
    with gzip.open(args.input, 'rt', encoding=args.encoding) as tsv:
        with open(args.bitext + '.' + args.src_lang, 'wt', encoding=args.encoding) as fsrc:
            with open(args.bitext + '.' + args.tgt_lang, 'wt', encoding=args.encoding) as ftgt:
                line = tsv.readline()
                while line:
                    fields = line.split('\t')
                    src, tgt = fields[1], fields[2]
                    nchar_npunc_src = len_no_punc(src, punc)
                    nchar_npunc_tgt = len_no_punc(tgt, punc)
                    if filter_overpunc(nchar_npunc_src, len(src)) and filter_overpunc(nchar_npunc_tgt, len(tgt)):
                        fsrc.write(src.strip() + '\n')
                        ftgt.write(tgt.strip() + '\n')
                    line = tsv.readline()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, type=str)
    parser.add_argument('--encoding', default='utf-8', help='character encoding for input/output')
    parser.add_argument('--bitext', type=str, required=True, help='output bitext prefix')
    parser.add_argument('--src-lang', type=str, required=True, help='Source language')
    parser.add_argument('--tgt-lang', type=str, required=True, help='Target language')
    main(parser.parse_args())
#!/usr/bin/env bash
# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
set -e
TOKENIZERS_SCRIPTS=tokenizers
INSTALL_PATH=$TOKENIZERS_SCRIPTS/thirdparty
N_THREADS=8
lg=$1
MOSES=$INSTALL_PATH/mosesdecoder
REPLACE_UNICODE_PUNCT=$MOSES/scripts/tokenizer/replace-unicode-punctuation.perl
NORM_PUNC=$MOSES/scripts/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$MOSES/scripts/tokenizer/remove-non-printing-char.perl
TOKENIZER=$MOSES/scripts/tokenizer/tokenizer.perl
# special tokenization for Romanian
WMT16_SCRIPTS=$INSTALL_PATH/wmt16-scripts
NORMALIZE_ROMANIAN=$WMT16_SCRIPTS/preprocess/normalise-romanian.py
REMOVE_DIACRITICS=$WMT16_SCRIPTS/preprocess/remove-diacritics.py
# Burmese
MY_SEGMENT=$INSTALL_PATH/seg_my.py
# Arabic
AR_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenizer_ar.sh
# Korean
KO_SEGMENT=$TOKENIZERS_SCRIPTS/seg_ko.sh
# Japanese
JA_SEGMENT=$TOKENIZERS_SCRIPTS/seg_ja.sh
# Indic
IN_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_indic.py
INDIC_RESOURCES_PATH=$INSTALL_PATH/indic_nlp_resources
# Thai
THAI_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_thai.py
# Chinese
CHINESE_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_zh.py
# Chinese
if [ "$lg" = "zh" ]; then
cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | python $CHINESE_TOKENIZER
# Thai
elif [ "$lg" = "th" ]; then
cat - | python $THAI_TOKENIZER
# Japanese
elif [ "$lg" = "ja" ]; then
cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | ${JA_SEGMENT}
# Korean
elif [ "$lg" = "ko" ]; then
cat - | $REM_NON_PRINT_CHAR | ${KO_SEGMENT}
# Romanian
elif [ "$lg" = "ro" ]; then
cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | $NORMALIZE_ROMANIAN | $REMOVE_DIACRITICS | $TOKENIZER -no-escape -threads $N_THREADS -l $lg
# Burmese
elif [ "$lg" = "my" ]; then
cat - | python ${MY_SEGMENT}
# Arabic
elif [ "$lg" = "ar" ]; then
cat - | ${AR_TOKENIZER}
# Indic
elif [ "$lg" = "ne" ]; then
cat - | python ${IN_TOKENIZER} $lg
elif [ "$lg" = "si" ]; then
cat - | python ${IN_TOKENIZER} $lg
elif [ "$lg" = "hi" ]; then
cat - | python ${IN_TOKENIZER} $lg
# other languages
else
cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | $TOKENIZER -no-escape -threads $N_THREADS -l $lg
fi
# M2M-100 Tokenization
We apply different tokenization strategies for different languages, following the existing literature. Here we provide `tok.sh`, a tokenizer that can be used to reproduce our results.
To reproduce the results, follow these steps:
```
tgt_lang=...
reference_translation=...
cat generation_output | grep -P "^H" | sort -V | cut -f 3- | sh tok.sh $tgt_lang > hyp
cat $reference_translation |sh tok.sh $tgt_lang > ref
sacrebleu -tok 'none' ref < hyp
```
## Installation
Tools needed for all languages except Arabic can be installed by running `install_dependencies.sh`.
If you want to evaluate Arabic models, please follow the instructions provided here: http://alt.qcri.org/tools/arabic-normalizer/ to install the required Arabic tools.
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
SCRIPT=`realpath $0`
KYTEA=`dirname $SCRIPT`/thirdparty/kytea
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$KYTEA/lib:/usr/local/lib
export PATH=$PATH:"$KYTEA/bin"
cat - | tr -d "[:blank:]" | kytea -notags
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
SCRIPT=`realpath $0`
MECAB=`dirname $SCRIPT`/thirdparty/mecab-0.996-ko-0.9.2
export PATH=$PATH:"$MECAB/bin":"$MECAB/lib"
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"$MECAB/lib"
cat - | mecab -O wakati
seg_my.py
indic_nlp_library/
indic_nlp_resources/
kytea/
mecab-0.996-ko-0.9.2.tar.gz
mecab-0.996-ko-0.9.2/
mosesdecoder/
wat2020.my-en.zip
wat2020.my-en/
wmt16-scripts/
mecab-ko-dic-2.1.1-20180720/
mecab-ko-dic-2.1.1-20180720.tar.gz
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Use: echo {text} | python tokenize_indic.py {language}
import sys
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize.indic_tokenize import trivial_tokenize
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer(
    sys.argv[1], remove_nuktas=False, nasals_mode="do_nothing"
)

for line in sys.stdin:
    normalized_line = normalizer.normalize(line.strip())
    tokenized_line = " ".join(trivial_tokenize(normalized_line, sys.argv[1]))
    print(tokenized_line)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import sys
from pythainlp import word_tokenize
for line in sys.stdin:
    print(" ".join(word_tokenize(line.strip())))
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import fileinput
import sacrebleu
for line in fileinput.input():
    print(sacrebleu.tokenize_zh(line))
#!/usr/bin/env sh
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Please follow the instructions here http://alt.qcri.org/tools/arabic-normalizer/
# to install tools needed for Arabic
echo "Please install Arabic tools: http://alt.qcri.org/tools/arabic-normalizer/"
echo "Then update environment variables in tokenizer_ar.sh"
exit 1
SVMTOOL=...
GOMOSESGO=...
QCRI_ARABIC_NORMALIZER=...
export PERL5LIB="$SVMTOOL/lib":"$GOMOSESGO/bin/MADA-3.2":$PERL5LIB
tempfile=$(mktemp)
cat - > $tempfile
cd $QCRI_ARABIC_NORMALIZER
bash qcri_normalizer_mada3.2_aramorph1.2.1.sh $tempfile
cat $tempfile.mada_norm-aramorph.europarl_tok
# MBART: Multilingual Denoising Pre-training for Neural Machine Translation
[https://arxiv.org/abs/2001.08210]
## Introduction
MBART is a sequence-to-sequence denoising auto-encoder pre-trained on large-scale monolingual corpora in many languages using the BART objective. It is one of the first methods for pre-training a complete sequence-to-sequence model by denoising full texts in multiple languages; previous approaches had focused only on the encoder, the decoder, or reconstructing parts of the text.
## Pre-trained models
Model | Description | # params | Download
---|---|---|---
`mbart.CC25` | mBART model with 12 encoder and decoder layers trained on 25 languages' monolingual corpus | 610M | [mbart.CC25.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.v2.tar.gz)
`mbart.ft.ro_en` | mBART CC25 model finetuned on the EN-RO language pair | 610M | [mbart.cc25.ft.enro.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.ft.enro.tar.gz)
## Results
**[WMT16 EN-RO](https://www.statmt.org/wmt16/translation-task.html)**
_(test set, no additional data used)_
Model | en-ro | ro-en
---|---|---
`Random` | 34.3 | 34.0
`mbart.cc25` | 37.7 | 37.8
`mbart.enro.bilingual` | 38.5 | 38.5
## BPE data
```bash
# download model
wget https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.v2.tar.gz
tar -xzvf mbart.cc25.v2.tar.gz
```
Install SPM from [here](https://github.com/google/sentencepiece), then apply BPE to the data:
```bash
SPM=/path/to/sentencepiece/build/src/spm_encode
MODEL=sentence.bpe.model
${SPM} --model=${MODEL} < ${DATA}/${TRAIN}.${SRC} > ${DATA}/${TRAIN}.spm.${SRC} &
${SPM} --model=${MODEL} < ${DATA}/${TRAIN}.${TGT} > ${DATA}/${TRAIN}.spm.${TGT} &
${SPM} --model=${MODEL} < ${DATA}/${VALID}.${SRC} > ${DATA}/${VALID}.spm.${SRC} &
${SPM} --model=${MODEL} < ${DATA}/${VALID}.${TGT} > ${DATA}/${VALID}.spm.${TGT} &
${SPM} --model=${MODEL} < ${DATA}/${TEST}.${SRC} > ${DATA}/${TEST}.spm.${SRC} &
${SPM} --model=${MODEL} < ${DATA}/${TEST}.${TGT} > ${DATA}/${TEST}.spm.${TGT} &
```
## Preprocess data
```bash
DICT=dict.txt
fairseq-preprocess \
--source-lang ${SRC} \
--target-lang ${TGT} \
--trainpref ${DATA}/${TRAIN}.spm \
--validpref ${DATA}/${VALID}.spm \
--testpref ${DATA}/${TEST}.spm \
--destdir ${DEST}/${NAME} \
--thresholdtgt 0 \
--thresholdsrc 0 \
--srcdict ${DICT} \
--tgtdict ${DICT} \
--workers 70
```
## Finetune on EN-RO
Finetune the pretrained mBART CC25 model on the EN-RO data:
```bash
PRETRAIN=mbart.cc25 # fix if you moved the downloaded checkpoint
langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN
fairseq-train path_2_data \
--encoder-normalize-before --decoder-normalize-before \
--arch mbart_large --layernorm-embedding \
--task translation_from_pretrained_bart \
--source-lang en_XX --target-lang ro_RO \
--criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
--optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
--lr-scheduler polynomial_decay --lr 3e-05 --warmup-updates 2500 --total-num-update 40000 \
--dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
--max-tokens 1024 --update-freq 2 \
--save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
--seed 222 --log-format simple --log-interval 2 \
--restore-file $PRETRAIN \
--reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \
--langs $langs \
--ddp-backend legacy_ddp
```
## Generate on EN-RO
To score the finetuned EN-RO model with sacreBLEU, get the tokenizer scripts [here](https://github.com/rsennrich/wmt16-scripts) and download the checkpoint:
```bash
wget https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.ft.enro.tar.gz
tar -xzvf mbart.cc25.ft.enro.tar.gz
```
```bash
model_dir=MBART_finetuned_enro # fix if you moved the checkpoint
fairseq-generate path_2_data \
--path $model_dir/model.pt \
--task translation_from_pretrained_bart \
--gen-subset test \
-t ro_RO -s en_XX \
--bpe 'sentencepiece' --sentencepiece-model $model_dir/sentence.bpe.model \
--sacrebleu --remove-bpe 'sentencepiece' \
--batch-size 32 --langs $langs > en_ro
cat en_ro | grep -P "^H" |sort -V |cut -f 3- | sed 's/\[ro_RO\]//g' |$TOKENIZER ro > en_ro.hyp
cat en_ro | grep -P "^T" |sort -V |cut -f 2- | sed 's/\[ro_RO\]//g' |$TOKENIZER ro > en_ro.ref
sacrebleu -tok 'none' -s 'none' en_ro.ref < en_ro.hyp
```
## Citation
```bibtex
@article{liu2020multilingual,
title={Multilingual Denoising Pre-training for Neural Machine Translation},
author={Yinhan Liu and Jiatao Gu and Naman Goyal and Xian Li and Sergey Edunov and Marjan Ghazvininejad and Mike Lewis and Luke Zettlemoyer},
year={2020},
eprint={2001.08210},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
# Megatron-11b
Megatron-11b is a unidirectional language model with `11B` parameters based on [Megatron-LM](https://arxiv.org/pdf/1909.08053.pdf). Following the original Megatron work, we trained the model using intra-layer model parallelism with each layer's parameters split across 8 GPUs.
Megatron-11b is trained on the same data and uses the same byte-pair encoding (BPE) as [RoBERTa](https://arxiv.org/pdf/1907.11692.pdf).
## Pre-trained models
Model | Description | # params | # filesize | Download
---|---|---|---|---
`megatron_11b` | megatron_11b unidirectional language model | 11B | 19 GB | [megatron_11b.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/model_parallel/megatron_11b.tar.gz)
#### Architecture:
Param | Value
---|---
embed_dim | 3072
ffn_dim | 3072 * 6
layers | 72
attention heads | 32
#### Training details:
Param | value
---|---
bsz | 512
num_updates | 300,000
peak_lr | 1.5e-04
lr scheduler | inverse_sqrt
clip norm | 0.0
## Example training command (model parallel)
Megatron-11b contains too many parameters to train on a single GPU. Following
the original Megatron work, we adopt an intra-layer model parallel training
approach in which each layer's parameters are split across multiple GPUs and
activations and gradients are communicated during the forward/backward pass,
respectively. We similarly split the loss computation using the
`vocab_parallel_cross_entropy` criterion.
The following training command illustrates how to do model parallel training in
fairseq. We assume that each machine (node) has 8 GPUs among which to split the
model parameters (`--model-parallel-size 8`). If you have access to multiple
nodes, you may combine this with data parallel training by increasing
`--distributed-world-size`.
To train Megatron-11b on a single node:
```bash
fairseq-train <DATA_PATH> \
--distributed-world-size 8 \
--memory-efficient-fp16 \
--num-workers 2 \
--model-parallel-size 8 \
--criterion vocab_parallel_cross_entropy \
--task language_modeling \
--sample-break-mode none \
--tokens-per-sample 1024 \
--arch transformer_lm_megatron_11b \
--share-decoder-input-output-embed \
--optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-08 --clip-norm 0.0 \
--lr-scheduler inverse_sqrt --lr 0.00015 \
--warmup-updates 3000 --weight-decay 0.01 \
--dropout 0.1 --attention-dropout 0.1 \
--batch-size 2 \
--max-update 300000;
```
Note: the above was tested on a `DGX-1` box with 8 x `V100-32GB` GPUs.
## Results
**[Wikitext103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/)**
Model | Valid perplexity | Test perplexity
---|---|---
`megatron_11b` | 10.64 | 10.54
## Evaluating `megatron_11b` on Wikitext-103
#### 1. Download Megatron-11b
```bash
# WARNING: this file is 19GB
wget https://dl.fbaipublicfiles.com/fairseq/models/model_parallel/megatron_11b.tar.gz
tar -xzvf megatron_11b.tar.gz
```
#### 2. Download Wikitext-103
```bash
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
unzip wikitext-103-raw-v1.zip
```
#### 3. Detokenize test tokens
Megatron-11b uses a byte-level BPE that expects raw (untokenized) input. Since
the wikitext-103 dataset comes tokenized, we apply a simple detokenization
process to restore the untokenized test set:
```bash
python -m examples.megatron_11b.detok wikitext-103-raw/wiki.test.raw > wikitext-103-raw/wiki.test.detok
```
#### 4. BPE encoding
```bash
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
python -m examples.roberta.multiprocessing_bpe_encoder \
--encoder-json encoder.json \
--vocab-bpe vocab.bpe \
--inputs "wikitext-103-raw/wiki.test.detok" \
--outputs "wikitext-103-raw/wiki.test.bpe" \
--workers 60;
```
#### 5. Fairseq binarize
```bash
fairseq-preprocess \
--only-source \
--testpref wikitext-103-raw/wiki.test.bpe \
--srcdict megatron_11b/dict.txt \
--destdir wikitext103-bin;
```
#### 6. Evaluate perplexity
We can now evaluate perplexity on the test set. Note that because we've modified
the test set (via detokenization and BPE), the perplexity reported by
`fairseq-eval-lm` needs to be renormalized.
Compute unnormalized perplexity:
```bash
DATA_PATH=wikitext103-bin/
fairseq-eval-lm \
$DATA_PATH \
--path megatron_11b/model.pt \
--task language_modeling \
--gen-subset test \
--batch-size 8 \
--criterion cross_entropy \
--context-window 992 \
--distributed-world-size 8 \
--model-parallel-size 8;
# Expected PPL (unnormalized_ppl): [8.46]
# Note: the eval command needs to run on 8 GPUs for the released model
```
To renormalize the perplexity, we must account for the change in token count
after detokenizing and applying BPE. The formula for this is:
`2 ^ ( log_2(unnormalized_PPL) * (new_token_cnt / orig_token_cnt))`
For the wikitext-103 test set, the original token count is `245566` and the
token count after detokenization and applying BPE is `270847`.
The perplexity after renormalization is:
`2 ^ ( log_2(8.46) * (270847 / 245566)) = 10.54`
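For reference, here is the same renormalization as a short Python check (a minimal sketch; the numbers are taken from above):
```python
import math

unnormalized_ppl = 8.46   # reported by fairseq-eval-lm
orig_token_cnt = 245566   # original wikitext-103 test tokens
new_token_cnt = 270847    # tokens after detokenization + BPE

normalized_ppl = 2 ** (math.log2(unnormalized_ppl) * (new_token_cnt / orig_token_cnt))
print(f"{normalized_ppl:.2f}")  # -> 10.54
```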
#!/usr/bin/env python3 -u
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import fileinput
import sacremoses
def main():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("files", nargs="*", help="input files")
    args = parser.parse_args()

    detok = sacremoses.MosesDetokenizer()
    for line in fileinput.input(args.files, openhook=fileinput.hook_compressed):
        print(
            detok.detokenize(line.strip().split(" "))
            .replace(" @", "")
            .replace("@ ", "")
            .replace(" =", "=")
            .replace("= ", "=")
            .replace(" – ", "–")
        )


if __name__ == "__main__":
    main()
# MMS Model Card
## Model details
**Organization developing the model** The FAIR team of Meta AI.
**Model version** This is version 1 of the model.
**Model type** MMS is a speech model based on the Transformer architecture. The pre-trained model comes in two sizes: 300M and 1B parameters. We fine-tune the model for speech recognition and make it available in the 1B variant. We also fine-tune the 1B variant for language identification.
**License** CC BY-NC
**Where to send questions or comments about the model** Questions and comments about MMS can be sent via the [GitHub repository](https://github.com/pytorch/fairseq/tree/master/examples/mms) of the project, by opening an issue and tagging it as MMS.
## Uses
**Primary intended uses** The primary use of MMS is to perform speech processing research for many more languages and to perform tasks such as automatic speech recognition, language identification, and speech synthesis.
**Primary intended users** The primary intended users of the model are researchers in speech processing, machine learning and artificial intelligence.
**Out-of-scope use cases** Fine-tuning the pre-trained models on other labeled datasets or downstream tasks requires further risk evaluation and mitigation.
## Bias and Risks
The MMS models were pre-trained on a blend of data from different domains, including readings of the New Testament. In the paper, we describe two studies analyzing gender bias and the use of religious language which conclude that models perform equally well for both genders and that on average, there is little bias for religious language (section 8 of the paper).
# Training Details
## Training Data
MMS is pre-trained on VoxPopuli (parliamentary speech), MLS (read audiobooks), VoxLingua-107 (YouTube speech), CommonVoice (read Wikipedia text), BABEL (telephone conversations), MMS-lab-U (New Testament readings), and MMS-unlab (various read Christian texts).
Models are fine-tuned on FLEURS, VoxLingua-107, MLS, CommonVoice, and MMS-lab. We obtained the language information for MMS-lab, MMS-lab-U and MMS-unlab from our data source and did not manually verify it for every language.
## Training Procedure
Please refer to the research paper for details on this.
# Evaluation
## Testing Data, Factors & Metrics
We evaluate the model on different benchmarks for the downstream tasks. The evaluation details are presented in the paper. The model's performance is measured using standard metrics such as character error rate, word error rate, and classification accuracy.
# Citation
**BibTeX:**
```
@article{pratap2023mms,
title={Scaling Speech Technology to 1,000+ Languages},
author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
journal={arXiv},
year={2023}
}
```
# Model Card Contact
Please reach out to the authors at: [vineelkpratap@meta.com](mailto:vineelkpratap@meta.com) [androstj@meta.com](mailto:androstj@meta.com) [bshi@meta.com](mailto:bshi@meta.com) [michaelauli@meta.com](mailto:michaelauli@gmail.com)
# MMS: Scaling Speech Technology to 1000+ languages
The Massively Multilingual Speech (MMS) project expands speech technology from about 100 languages to over 1,000 by building a single multilingual speech recognition model supporting over 1,100 languages (more than 10 times as many as before), language identification models able to identify over [4,000 languages](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) (40 times more than before), pretrained models supporting over 1,400 languages, and text-to-speech models for over 1,100 languages. Our goal is to make it easier for people to access information and to use devices in their preferred language.
You can find details in the paper [Scaling Speech Technology to 1000+ languages](https://research.facebook.com/publications/scaling-speech-technology-to-1000-languages/) and the [blog post](https://ai.facebook.com/blog/multilingual-model-speech-recognition/).
An overview of the languages covered by MMS can be found [here](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html).
## 🤗 Transformers
MMS has been added to Transformers. For more information, please refer to [Transformers' MMS docs](https://huggingface.co/docs/transformers/main/en/model_doc/mms).
[Click here](https://huggingface.co/models?other=mms) to find all MMS checkpoints on the Hub.
Check out the demo here [![Open In HF Spaces](https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-sm-dark.svg)](https://huggingface.co/spaces/facebook/MMS)
## Finetuned models
### ASR
| Model | Languages | Dataset | Checkpoint | Dictionary* | Supported languages | |
|---|---|---|---|---|---|---
MMS-1B:FL102 | 102 | FLEURS | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_fl102.pt) | [download](https://dl.fbaipublicfiles.com/mms/asr/dict/mms1b_fl102/eng.txt) | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_fl102_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-1b-fl102)
MMS-1B:L1107| 1107 | MMS-lab | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_l1107.pt) | [download](https://dl.fbaipublicfiles.com/mms/asr/dict/mms1b_l1107/eng.txt) | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_l1107_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-1b-l1107)
MMS-1B-all| 1162 | MMS-lab + FLEURS <br>+ CV + VP + MLS | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_all.pt) | [download](https://dl.fbaipublicfiles.com/mms/asr/dict/mms1b_all/eng.txt) | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_all_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-1b-all)
\* In the `Dictionary` column, we provide the download link for the token dictionary in English. To download the token dictionary for a different language supported by the model, modify the language code in the URL appropriately. For example, to get the token dictionary of the FL102 model for Hindi, use [this](https://dl.fbaipublicfiles.com/mms/asr/dict/mms1b_fl102/hin.txt) link.
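The URL pattern is mechanical, so it can also be built in a few lines of Python (the helper below is illustrative and not part of the repo):
```python
# Illustrative helper: build the ASR dictionary URL for a given model name
# ("mms1b_fl102", "mms1b_l1107", or "mms1b_all") and ISO 639-3 language code.
BASE_URL = "https://dl.fbaipublicfiles.com/mms/asr/dict"

def dict_url(model: str, iso_code: str) -> str:
    return f"{BASE_URL}/{model}/{iso_code}.txt"

print(dict_url("mms1b_fl102", "hin"))
# -> https://dl.fbaipublicfiles.com/mms/asr/dict/mms1b_fl102/hin.txt
```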
### TTS
1. Download the list of [ISO codes](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) of 1107 languages.
2. Find the ISO code of the target language and download the checkpoint. Each folder contains 3 files: `G_100000.pth`, `config.json`, and `vocab.txt`. `G_100000.pth` is the generator trained for 100K updates, `config.json` is the training config, and `vocab.txt` is the vocabulary for the TTS model.
```
# Examples:
wget https://dl.fbaipublicfiles.com/mms/tts/eng.tar.gz # English (eng)
wget https://dl.fbaipublicfiles.com/mms/tts/azj-script_latin.tar.gz # North Azerbaijani (azj-script_latin)
```
The above command downloads the generator only, which is enough to run TTS inference. If you want the full model checkpoint, which also includes the discriminator (`D_100000.pth`) and the optimizer states, download it as follows.
```
# Example (full checkpoint: generator + discriminator + optimizer):
wget https://dl.fbaipublicfiles.com/mms/tts/full_model/eng.tar.gz # English (eng)
```
### LID
\# Languages | Dataset | Model | Dictionary | Supported languages | |
|---|---|---|---|---|---
126 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l126.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l126/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l126_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-lid-126)
256 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l256.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l256/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l256_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-lid-256)
512 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l512.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l512/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l512_langs.html)| [🤗 Hub](https://huggingface.co/facebook/mms-lid-512)
1024 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l1024.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l1024/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l1024_langs.html)| [🤗 Hub](https://huggingface.co/facebook/mms-lid-1024)
2048 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l2048.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l2048/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l2048_langs.html)| [🤗 Hub](https://huggingface.co/facebook/mms-lid-2048)
4017 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l4017.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l4017/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l4017_langs.html)| [🤗 Hub](https://huggingface.co/facebook/mms-lid-4017)
## Commands to run inference
### ASR
Run this command to transcribe one or more audio files:
```shell
cd /path/to/fairseq-py/
python examples/mms/asr/infer/mms_infer.py --model "/path/to/asr/model" --lang lang_code \
--audio "/path/to/audio_1.wav" "/path/to/audio_2.wav" "/path/to/audio_3.wav"
```
We also provide an IPython notebook example inside the `asr/tutorial` folder: [ipynb](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/asr/tutorial/MMS_ASR_Inference_Colab.ipynb) or [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/facebookresearch/fairseq/blob/main/examples/mms/asr/tutorial/MMS_ASR_Inference_Colab.ipynb)
For more advanced configuration, and to calculate CER/WER, you can prepare a manifest folder with this format:
```
$ ls /path/to/manifest
dev.tsv
dev.wrd
dev.ltr
dev.uid
# dev.tsv: each line contains <audio_path> <number_of_samples>
# if you don't have this information, run misc/get_sample_size.py
$ cat dev.tsv
/
/path/to/audio_1.wav 180000
/path/to/audio_2.wav 200000
$ cat dev.ltr
t h i s | i s | o n e |
t h i s | i s | t w o |
$ cat dev.wrd
this is one
this is two
$ cat dev.uid
audio_1
audio_2
```
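If you prefer to generate these files with a script, here is a minimal sketch (it assumes the `soundfile` package; the paths and transcripts are placeholders):
```python
import os
import soundfile as sf

audio = ["/path/to/audio_1.wav", "/path/to/audio_2.wav"]
transcripts = ["this is one", "this is two"]

with open("dev.tsv", "w") as tsv, open("dev.wrd", "w") as wrd, \
        open("dev.ltr", "w") as ltr, open("dev.uid", "w") as uid:
    tsv.write("/\n")  # first line: audio root ("/" when using absolute paths)
    for path, text in zip(audio, transcripts):
        frames = len(sf.read(path)[0])  # <number_of_samples>
        tsv.write(f"{path}\t{frames}\n")
        wrd.write(text + "\n")
        # letter transcription: characters separated by spaces, '|' marks word boundaries
        ltr.write(" ".join(text.replace(" ", "|")) + " |\n")
        uid.write(os.path.splitext(os.path.basename(path))[0] + "\n")
```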
Then run the command below:
```
lang_code=<iso_code>
PYTHONPATH=. PREFIX=INFER HYDRA_FULL_ERROR=1 python examples/speech_recognition/new/infer.py -m --config-dir examples/mms/config/ --config-name infer_common decoding.type=viterbi dataset.max_tokens=4000000 distributed_training.distributed_world_size=1 "common_eval.path='/path/to/asr/model'" task.data='/path/to/manifest' dataset.gen_subset="${lang_code}:dev" common_eval.post_process=letter
```
Available options:
* To get the raw character-based output, set `common_eval.post_process=none`.
* To maximize GPU efficiency or avoid out-of-memory (OOM) errors, tune the `dataset.max_tokens` size.
* To run language model decoding, install the flashlight python bindings:
```
git clone --recursive git@github.com:flashlight/flashlight.git
cd flashlight;
git checkout 035ead6efefb82b47c8c2e643603e87d38850076
cd bindings/python
python3 setup.py install
```
Train a [KenLM language model](https://github.com/flashlight/wav2letter/tree/main/recipes/rasr#language-model) and prepare a lexicon file in [this](https://dl.fbaipublicfiles.com/wav2letter/rasr/tutorial/lexicon.txt) format. Pretrained language models from our paper can be found on the [🤗 Hub](https://huggingface.co/facebook/mms-cclms/).
```
LANG=<iso> # for example - 'eng', 'azj-script_latin'
PYTHONPATH=. PREFIX=INFER HYDRA_FULL_ERROR=1 python examples/speech_recognition/new/infer.py --config-dir=examples/mms/asr/config \
--config-name=infer_common decoding.type=kenlm distributed_training.distributed_world_size=1 \
decoding.unique_wer_file=true decoding.beam=500 decoding.beamsizetoken=50 \
task.data=<MANIFEST_FOLDER_PATH> common_eval.path='<MODEL_PATH.pt>' decoding.lexicon=<LEXICON_FILE> decoding.lmpath=<LM_FILE> \
decoding.results_path=<OUTPUT_DIR> dataset.gen_subset=${LANG}:dev decoding.lmweight=??? decoding.wordscore=???
```
We typically sweep `lmweight` in the range of 0 to 5 and `wordscore` in the range of -3 to 3; a sketch of such a sweep is shown below. The output directory will contain the reference and hypothesis outputs from the decoder.
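A minimal sketch of such a sweep (the 0.5 grid step is an assumption, not a fairseq default):
```python
# Enumerate (lmweight, wordscore) combinations for the decoding sweep above.
import itertools

lmweights = [x * 0.5 for x in range(0, 11)]    # 0.0 .. 5.0
wordscores = [x * 0.5 for x in range(-6, 7)]   # -3.0 .. 3.0

for lmw, ws in itertools.product(lmweights, wordscores):
    print(f"decoding.lmweight={lmw} decoding.wordscore={ws}")
```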
For decoding with character-based language models, use an empty lexicon file (`decoding.lexicon=`), set `decoding.unitlm=True`, and sweep over `decoding.silweight` instead of `wordscore`.
### TTS
Note: clone and install [VITS](https://github.com/jaywalnut310/vits) before running inference.
```shell
## English TTS
$ PYTHONPATH=$PYTHONPATH:/path/to/vits python examples/mms/tts/infer.py --model-dir /path/to/model/eng \
--wav ./example.wav --txt "Expanding the language coverage of speech technology \
has the potential to improve access to information for many more people"
## Maithili TTS
$ PYTHONPATH=$PYTHONPATH:/path/to/vits python examples/mms/tts/infer.py --model-dir /path/to/model/mai \
--wav ./example.wav --txt "मुदा आइ धरि ई तकनीक सौ सं किछु बेसी भाषा तक सीमित छल जे सात हजार \
सं बेसी ज्ञात भाषाक एकटा अंश अछी"
```
`example.wav` contains synthesized audio for the language.
We also provide an IPython notebook example inside the `tts/tutorial` folder: [ipynb](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/tts/tutorial/MMS_TTS_Inference_Colab.ipynb) or [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/facebookresearch/fairseq/blob/main/examples/mms/tts/tutorial/MMS_TTS_Inference_Colab.ipynb)
### LID
Prepare two files in the following format. Each manifest line contains the `<audio_path>` and `<number_of_samples>`:
```
#/path/to/manifest.tsv
/
/path/to/audio1.wav 180000
/path/to/audio2.wav 240000
/path/to/audio3.wav 160000
# /path/to/manifest.lang
eng 1
eng 1
eng 1
```
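For scripted setups, here is a minimal sketch (the paths, sample counts, and the reference label `eng` are placeholders) that writes both files:
```python
audio = [("/path/to/audio1.wav", 180000),
         ("/path/to/audio2.wav", 240000),
         ("/path/to/audio3.wav", 160000)]

with open("manifest.tsv", "w") as tsv, open("manifest.lang", "w") as lang:
    tsv.write("/\n")  # root line, as in the example above
    for path, n_samples in audio:
        tsv.write(f"{path}\t{n_samples}\n")
        lang.write("eng 1\n")  # one reference label per audio line
```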
Download the model and the corresponding dictionary file for the LID model.
Use the following command to run inference:
```shell
$ PYTHONPATH='.' python3 examples/mms/lid/infer.py /path/to/dict/l126/ --path /path/to/models/mms1b_l126.pt \
--task audio_classification --infer-manifest /path/to/manifest.tsv --output-path <OUTDIR>
```
The above command assumes there is a file named `dict.lang.txt` in `/path/to/dict/l126/`. `<OUTDIR>/predictions.txt` will contain the model's predictions for the audio files in `manifest.tsv`.
We also provide an IPython notebook example inside the `lid/tutorial` folder: [ipynb](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/lid/tutorial/MMS_LID_Inference_Colab.ipynb) or [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/facebookresearch/fairseq/blob/main/examples/mms/lid/tutorial/MMS_LID_Inference_Colab.ipynb)
## Fine-tuning
### ASR
MMS Adapter fine-tuning has been added to the official 🤗 Transformers examples [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/speech-recognition#connectionist-temporal-classification-with-adapters).
For a more step-by-step explanation of how to fine-tune MMS, please have a look at the blog [**Fine-tuning MMS Adapter Models for Multi-Lingual ASR**](https://huggingface.co/blog/mms_adapters) on 🤗 blogs.
## Pretrained models
| Model | Link | |
|---|---|---
MMS-300M | [download](https://dl.fbaipublicfiles.com/mms/pretraining/base_300m.pt) | [🤗 Hub](https://huggingface.co/facebook/mms-300m)
MMS-1B | [download](https://dl.fbaipublicfiles.com/mms/pretraining/base_1b.pt) | [🤗 Hub](https://huggingface.co/facebook/mms-1b)
Example commands to finetune the pretrained models can be found [here](https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec#fine-tune-a-pre-trained-model-with-ctc).
## Forced Alignment Tooling
We also developed an efficient forced alignment algorithm implemented on GPU that can process very long audio files. The algorithm is open-sourced, and we provide instructions on how to use it [here](data_prep). We also open-source a multilingual alignment model trained on 31K hours of data in 1,130 languages, as well as text normalization scripts.
# License
The MMS code and model weights are released under the CC-BY-NC 4.0 license.
# Citation
**BibTeX:**
```
@article{pratap2023mms,
title={Scaling Speech Technology to 1,000+ Languages},
author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
journal={arXiv},
year={2023}
}
```
# @package _global_
# defaults:
#   - hydra/launcher: submitit_slurm
# @package _group_
task:
  _name: audio_finetuning
  data: null
  labels: ltr
common_eval:
  path: null
  post_process: letter
# model_overrides: "{'task':{'multi_corpus_keys':None}}"
decoding:
  type: viterbi
  lexicon: null
  unique_wer_file: false
  results_path: null
distributed_training:
  ddp_backend: legacy_ddp
  distributed_world_size: 1
hydra:
  run:
    dir: ${common_eval.results_path}/${dataset.gen_subset}
  sweep:
    dir: /checkpoint/${env:USER}/${env:PREFIX}/${common_eval.results_path}
    subdir: ${dataset.gen_subset}
dataset:
  max_tokens: 2_000_000
  gen_subset: dev
  required_batch_size_multiple: 1
#!/bin/bash
lang="$1"
PYTHONPATH=. PREFIX=INFER HYDRA_FULL_ERROR=1 python examples/speech_recognition/new/infer.py -m --config-dir examples/mms/asr/config/ --config-name infer_common decoding.type=viterbi dataset.max_tokens=4000000 distributed_training.distributed_world_size=1 "common_eval.path='/fsx-wav2vec/androstj/exps/wav2vec/mms/v4/finetune/xl1b_d5_dfls_0_0.3_u300k__ft_on_d5_127_dbeta1/ft_smax_adp_common.seed:1__dataset.max_tokens:2880000__optimization.lr:[0.001]__optimization.max_update:4000__merged_ckpt/checkpoints/checkpoint_last.pt'" task.data=/fsx-wav2vec/androstj/dataset/v4/fl/fseq dataset.gen_subset="${lang}:${lang}/dev" common_eval.post_process=none