Commit 764b3a75 authored by Sugon_ldc

add new model
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: true
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
joint_conf:
join_dim: 512
prejoin_linear: True
postjoin_linear: false
joint_mode: 'add'
activation: 'tanh'
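# Rough sketch of what this joint config implies (not the literal implementation):
#   h = tanh(Linear_enc(enc_out) + Linear_pred(pred_out))  # prejoin_linear projects both to join_dim, joined with 'add'
#   logits = Linear_out(h)                                  # final projection to the vocabulary; postjoin_linear is disabled here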
predictor: rnn
predictor_conf:
embed_size: 256
output_size: 256
embed_dropout: 0.1
hidden_size: 256
num_layers: 2
bias: true
rnn_type: 'lstm'
dropout: 0.1
decoder: bitransformer
decoder_conf:
attention_heads: 4
dropout_rate: 0.1
linear_units: 2048
num_blocks: 3
positional_dropout_rate: 0.1
r_num_blocks: 3
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid transducer+ctc+attention
model_conf:
transducer_weight: 0.75
ctc_weight: 0.1
attention_weight: 0.15
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
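# Loosely, the training loss combines the three branches as
#   loss ≈ transducer_weight * L_rnnt + ctc_weight * L_ctc + attention_weight * L_att
# with reverse_weight mixing the right-to-left decoder into the attention branch.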
dataset_conf:
filter_conf:
max_length: 1650
min_length: 10
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
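# SpecAugment sketch: num_t_mask random time masks (each up to max_t frames) and
# num_f_mask frequency masks (each up to max_f mel bins) are applied per utterance.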
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'dynamic' # static or dynamic
max_frames_in_batch: 4000
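# With 'dynamic' batching, utterances are packed into a batch until the total
# number of frames would exceed max_frames_in_batch (rough description).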
grad_clip: 4
accum_grad: 1
max_epoch: 140
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
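# warmuplr sketch, assuming the usual Noam-style implementation:
#   lr(step) = lr * warmup_steps^0.5 * min(step^-0.5, step * warmup_steps^-1.5)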
../s0/local/
../s0/path.sh
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU; otherwise it's multi-GPU training based on DDP in PyTorch
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
stage=-1 # start from -1 if you need to download the data, 0 to start from data preparation
stop_stage=7
# data
# alternative mirrors:
# data_url=www.openslr.org/resources/12
# data_url=https://us.openslr.org/resources/12
data_url=https://openslr.elda.org/resources/12
# use your own data path
datadir=
# wav data dir
wave_data=data
# Optional train_config
# 1. conf/train_transformer_large.yaml: Standard transformer
train_config=conf/conformer_rnnt.yaml
checkpoint=
cmvn=true
do_delta=false
dir=exp/conformer_transducer
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
# you may need to adjust it if you cannot get results close to those in README.md
average_num=10
decode_modes="attention_rescoring ctc_greedy_search ctc_prefix_beam_search attention"
. tools/parse_options.sh || exit 1;
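# Example invocations (paths and stage numbers are illustrative):
#   bash run.sh --stage 0 --stop_stage 3 --datadir /path/to/OpenSLR   # data preparation only
#   bash run.sh --stage 4 --stop_stage 4                              # training only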
# bpemode (unigram or bpe)
nbpe=5000
bpemode=unigram
set -e
set -u
set -o pipefail
train_set=train_960
dev_set=dev
recog_set="test_clean test_other dev_clean dev_other"
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
for part in train-clean-100 train-clean-360 train-other-500; do
local/download_and_untar.sh ${datadir} ${data_url} ${part}
done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to do the following data preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: Data preparation"
for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
# use underscore-separated names in data directories.
local/data_prep_torchaudio.sh ${datadir}/LibriSpeech/${part} $wave_data/${part//-/_}
done
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
### Task dependent. You have to design training and dev sets by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 1: Feature Generation"
mkdir -p $wave_data/train_960
# merge total training data
for set in train_clean_100 train_clean_360 train_other_500; do
for f in `ls $wave_data/$set`; do
cat $wave_data/$set/$f >> $wave_data/train_960/$f
done
done
mkdir -p $wave_data/dev
# merge total dev data
for set in dev_clean dev_other; do
for f in `ls $wave_data/$set`; do
cat $wave_data/$set/$f >> $wave_data/$dev_set/$f
done
done
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp $wave_data/$train_set/wav.scp \
--out_cmvn $wave_data/$train_set/global_cmvn
fi
dict=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p data/lang_char/
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
# we borrowed the BPE-related code and scripts from ESPnet.
cut -f 2- -d" " $wave_data/${train_set}/text > $wave_data/lang_char/input.txt
tools/spm_train --input=$wave_data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
tools/spm_encode --model=${bpemodel}.model --output_format=piece < $wave_data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
wc -l ${dict}
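# The resulting dictionary looks roughly like this (token ids are illustrative):
#   <blank> 0
#   <unk> 1
#   ▁THE 2
#   ...one line per BPE piece...
#   <sos/eos> <next free id>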
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# Prepare wenet required data
echo "Prepare data, prepare required format"
for x in $dev_set ${recog_set} $train_set ; do
tools/make_raw_list.py $wave_data/$x/wav.scp $wave_data/$x/text \
$wave_data/$x/data.list
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
dist_backend="nccl"
cmvn_opts=
$cmvn && cmvn_opts="--cmvn $wave_data/${train_set}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
python3 wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type raw \
--symbol_table $dict \
--bpe_model ${bpemodel}.model \
--train_data $wave_data/$train_set/data.list \
--cv_data $wave_data/$dev_set/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $num_gpus \
--ddp.rank $i \
--ddp.dist_backend $dist_backend \
--num_workers 4 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test model, please specify the model you want to test by --checkpoint
cmvn_opts=
$cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
# TODO, Add model average here
mkdir -p $dir/test
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
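# e.g. decoding_chunk_size=16 to simulate chunk-based streaming decoding,
# or leave it empty / set -1 for full-context decoding (illustrative values)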
ctc_weight=0.5
# Poll GPU ids in round-robin order, beginning with index 0
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
idx=0
for test in $recog_set; do
for mode in ${decode_modes}; do
{
{
test_dir=$dir/${test}_${mode}
mkdir -p $test_dir
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type raw \
--dict $dict \
--bpe_model ${bpemodel}.model \
--test_data $wave_data/$test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--result_file $test_dir/text_bpe \
--ctc_weight $ctc_weight \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp
cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp
tools/spm_decode --model=${bpemodel}.model --input_format=piece \
< $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value_tmp
paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value_tmp > $test_dir/text
python tools/compute-wer.py --char=1 --v=1 \
$wave_data/$test/text $test_dir/text > $test_dir/wer
} &
((idx+=1))
if [ $idx -eq $num_gpus ]; then
idx=0
fi
}
done
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip
fi
# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
lm=data/local/lm
lexicon=data/local/dict/lexicon.txt
mkdir -p $lm
mkdir -p data/local/dict
# 7.1 Download & format LM
which_lm=3-gram.pruned.1e-7.arpa.gz
if [ ! -e ${lm}/${which_lm} ]; then
wget http://www.openslr.org/resources/11/${which_lm} -P ${lm}
fi
echo "unzip lm($which_lm)..."
gunzip -k ${lm}/${which_lm} -c > ${lm}/lm.arpa
echo "Lm saved as ${lm}/lm.arpa"
# 7.2 Prepare dict
unit_file=$dict
bpemodel=$bpemodel
# use $dir/words.txt (unit_file) and $dir/train_960_unigram5000 (bpemodel)
# if you download pretrained librispeech conformer model
cp $unit_file data/local/dict/units.txt
if [ ! -e ${lm}/librispeech-lexicon.txt ]; then
wget http://www.openslr.org/resources/11/librispeech-lexicon.txt -P ${lm}
fi
echo "build lexicon..."
tools/fst/prepare_dict.py $unit_file ${lm}/librispeech-lexicon.txt \
$lexicon $bpemodel.model
echo "lexicon saved as '$lexicon'"
# 7.3 Build decoding TLG
tools/fst/compile_lexicon_token_fst.sh \
data/local/dict data/local/tmp data/local/lang
tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
# 7.4 Decoding with runtime
fst_dir=data/lang_test
for test in ${recog_set}; do
./tools/decode.sh --nj 6 \
--beam 10.0 --lattice_beam 5 --max_active 7000 --blank_skip_thresh 0.98 \
--ctc_weight 0.5 --rescoring_weight 1.0 --acoustic_scale 1.2 \
--fst_path $fst_dir/TLG.fst \
--dict_path $fst_dir/words.txt \
data/$test/wav.scp data/$test/text $dir/final.zip $fst_dir/units.txt \
$dir/lm_with_runtime_${test}
tail $dir/lm_with_runtime_${test}/wer
done
fi
../../../tools/
../../../wenet/
# Performance Record
## Conformer Result Bidecoder (large)
* Encoder FLOPs(30s): 96,238,430,720, params: 85,709,704
* Feature info: using fbank feature, cmvn, dither, online speed perturb
* Training info: train_conformer_bidecoder_large.yaml, kernel size 31, lr 0.002, batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 1.0
* Decoding info: ctc_weight 0.3, reverse weight 0.5, average_num 30
* Git hash: 65270043fc8c2476d1ab95e7c39f730017a670e0
* LM-tgmed: [3-gram.pruned.1e-7.arpa.gz](http://www.openslr.org/resources/11/3-gram.pruned.1e-7.arpa.gz)
* LM-tglarge: [3-gram.arpa.gz](http://www.openslr.org/resources/11/3-gram.arpa.gz)
* LM-fglarge: [4-gram.arpa.gz](http://www.openslr.org/resources/11/4-gram.arpa.gz)
| decoding mode | test clean | test other |
|----------------------------------|------------|------------|
| ctc prefix beam search | 2.96 | 7.14 |
| attention rescoring | 2.66 | 6.53 |
| LM-tgmed + attention rescoring | 2.78 | 6.32 |
| LM-tglarge + attention rescoring | 2.68 | 6.10 |
| LM-fglarge + attention rescoring | 2.65 | 5.98 |
## SqueezeFormer Result (U2++, FFN:2048)
* Encoder info:
* SM12, reduce_idx 5, recover_idx 11, conv1d, batch_norm, syncbn
* encoder_dim 512, output_size 512, head 8, ffn_dim 512*4=2048
* Encoder FLOPs(30s): 82,283,704,832, params: 85,984,648
* Feature info:
* using fbank feature, cmvn, dither, online speed perturb, spec_aug
* Training info:
* train_squeezeformer_bidecoder_large.yaml, kernel size 31
* batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 1.0
* adamw, lr 8e-4, NoamHold, warmup 0.2, hold 0.3, lr_decay 1.0
* Decoding info:
* ctc_weight 0.3, reverse weight 0.5, average_num 30
| decoding mode | dev clean | dev other | test clean | test other |
|----------------------------------|-----------|-----------|------------|------------|
| ctc greedy search | 2.55 | 6.62 | 2.73 | 6.59 |
| ctc prefix beam search | 2.53 | 6.60 | 2.72 | 6.52 |
| attention decoder | 2.93 | 6.56 | 3.31 | 6.47 |
| attention rescoring | 2.19 | 6.06 | 2.45 | 5.85 |
## Conformer Result
* Encoder FLOPs(30s): 34,085,088,512, params: 34,761,608
* Feature info: using fbank feature, cmvn, dither, online speed perturb
* Training info: train_conformer.yaml, kernel size 31, lr 0.004, batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 30
* Git hash: 90d9a559840e765e82119ab72a11a1f7c1a01b78
* LM-fglarge: [4-gram.arpa.gz](http://www.openslr.org/resources/11/4-gram.arpa.gz)
| decoding mode | test clean | test other |
|----------------------------------|------------|------------|
| ctc greedy search | 3.51 | 9.57 |
| ctc prefix beam search | 3.51 | 9.56 |
| attention decoder | 3.05 | 8.36 |
| attention rescoring | 3.18 | 8.72 |
| attention rescoring (beam 50) | 3.12 | 8.55 |
| LM-fglarge + attention rescoring | 3.09 | 7.40 |
## Conformer Result (12 layers, FFN:2048)
* Encoder FLOPs(30s): 34,085,088,512, params: 34,761,608
* Feature info: using fbank feature, cmvn, dither, online speed perturb
* Training info: train_squeezeformer.yaml, kernel size 31,
* batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 0.1
* AdamW, lr 1e-3, NoamHold, warmup 0.2, hold 0.3, lr_decay 1.0
* Decoding info: ctc_weight 0.3, reverse weight 0.5, average_num 30
| decoding mode | dev clean | dev other | test clean | test other |
|----------------------------------|-----------|-----------|------------|------------|
| ctc greedy search | 3.49 | 9.59 | 3.66 | 9.59 |
| ctc prefix beam search | 3.49 | 9.61 | 3.66 | 9.55 |
| attention decoder | 3.52 | 9.04 | 3.85 | 8.97 |
| attention rescoring | 3.10 | 8.91 | 3.29 | 8.81 |
## SqueezeFormer Result (SM12, FFN:1024)
* Encoder info:
* SM12, reduce_idx 5, recover_idx 11, conv2d, w/o syncbn
* encoder_dim 256, output_size 256, head 4, ffn_dim 256*4=1024
* Encoder FLOPs(30s): 21,158,877,440, params: 22,219,912
* Feature info:
* using fbank feature, cmvn, dither, online speed perturb
* Training info:
* train_squeezeformer.yaml, kernel size 31,
* batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 0.1
* adamw, lr=1e-3, noamhold, warmup=0.2, hold=0.3, lr_decay=1.0
* Decoding info: ctc_weight 0.3, reverse weight 0.5, average_num 30
| decoding mode | dev clean | dev other | test clean | test other |
|----------------------------------|-----------|-----------|------------|------------|
| ctc greedy search | 3.49 | 9.24 | 3.51 | 9.28 |
| ctc prefix beam search | 3.44 | 9.23 | 3.51 | 9.25 |
| attention decoder | 3.59 | 8.74 | 3.75 | 8.70 |
| attention rescoring | 2.97 | 8.48 | 3.07 | 8.44 |
## SqueezeFormer Result (SM12, FFN:2048)
* Encoder info:
* SM12, reduce_idx 5, recover_idx 11, conv2d, w/o syncbn
* encoder_dim 256, output_size 256, head 4, ffn_dim 256*8=2048
* encoder FLOPs(30s): 28,230,473,984, params: 34,827,400
* Feature info: using fbank feature, cmvn, dither, online speed perturb
* Training info:
* train_squeezeformer.yaml, kernel size 31
* batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 0.1
* adamw, lr 1e-3, noamhold, warmup 0.2, hold 0.3, lr_decay 1.0
* Decoding info:
* ctc_weight 0.3, reverse weight 0.5, average_num 30
| decoding mode | dev clean | dev other | test clean | test other |
|----------------------------------|-----------|-----------|------------|------------|
| ctc greedy search | 3.34 | 9.01 | 3.47 | 8.85 |
| ctc prefix beam search | 3.33 | 9.02 | 3.46 | 8.81 |
| attention decoder | 3.64 | 8.62 | 3.91 | 8.33 |
| attention rescoring | 2.89 | 8.34 | 3.10 | 8.03 |
## SqueezeFormer Result (SM12, FFN:1312)
* Encoder info:
* SM12, reduce_idx 5, recover_idx 11, conv1d, w/o syncbn
* encoder_dim 328, output_size 256, head 4, ffn_dim 328*4=1312
* encoder FLOPs(30s): 34,103,960,008, params: 35,678,352
* Feature info:
* using fbank feature, cmvn, dither, online speed perturb
* Training info:
* train_squeezeformer.yaml, kernel size 31,
* batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 1.0
* adamw, lr 1e-3, noamhold, warmup 0.2, hold 0.3, lr_decay 1.0
* Decoding info:
* ctc_weight 0.3, reverse weight 0.5, average_num 30
| decoding mode | dev clean | dev other | test clean | test other |
|----------------------------------|-----------|-----------|------------|------------|
| ctc greedy search | 3.20 | 8.46 | 3.30 | 8.58 |
| ctc prefix beam search | 3.18 | 8.44 | 3.30 | 8.55 |
| attention decoder | 3.38 | 8.31 | 3.89 | 8.32 |
| attention rescoring | 2.81 | 7.86 | 2.96 | 7.91 |
## Conformer U2++ Result
* Feature info: using fbank feature, cmvn, no speed perturb, dither
* Training info: train_u2++_conformer.yaml lr 0.001, batch size 24, 8 gpu, acc_grad 1, 120 epochs, dither 1.0
* Decoding info: ctc_weight 0.3, reverse weight 0.5, average_num 30
* Git hash: 65270043fc8c2476d1ab95e7c39f730017a670e0
test clean
| decoding mode | full | 16 |
|--------------------------------|------|------|
| ctc prefix beam search | 3.76 | 4.54 |
| attention rescoring | 3.32 | 3.80 |
test other
| decoding mode | full | 16 |
|--------------------------------|-------|-------|
| ctc prefix beam search | 9.50 | 11.52 |
| attention rescoring | 8.67 | 10.38 |
## SqueezeFormer Result (U2++, FFN:2048)
* Encoder info:
* SM12, reduce_idx 5, recover_idx 11, conv1d, layer_norm
* do_rel_shift false, warp_for_time, syncbn
* encoder_dim 256, output_size 256, head 4, ffn_dim 256*8=2048
* Encoder FLOPs(30s): 28,255,337,984, params: 34,893,704
* Feature info:
* using fbank feature, cmvn, dither, online speed perturb
* Training info:
* train_squeezeformer.yaml, kernel size 31
* batch size 12, 8 gpu, acc_grad 2, 120 epochs, dither 1.0
* adamw, lr 8e-4, NoamHold, warmup 0.2, hold 0.3, lr_decay 1.0
* Decoding info:
* ctc_weight 0.3, reverse weight 0.5, average_num 30
test clean
| decoding mode | full | 16 |
|--------------------------------|------|------|
| ctc prefix beam search | 3.45 | 4.34 |
| attention rescoring | 3.07 | 3.71 |
test other
| decoding mode | full | 16 |
|--------------------------------|-------|-------|
| ctc prefix beam search | 8.29 | 10.60 |
| attention rescoring | 7.58 | 9.60 |
## Conformer U2 Result
* Feature info: using fbank feature, cmvn, speed perturb, dither
* Training info: train_unified_conformer.yaml lr 0.001, batch size 10, 8 gpu, acc_grad 1, 120 epochs, dither 1.0
* Decoding info: ctc_weight 0.5, average_num 30
* Git hash: 90d9a559840e765e82119ab72a11a1f7c1a01b78
* LM-tgmed: [3-gram.pruned.1e-7.arpa.gz](http://www.openslr.org/resources/11/3-gram.pruned.1e-7.arpa.gz)
* LM-tglarge: [3-gram.arpa.gz](http://www.openslr.org/resources/11/3-gram.arpa.gz)
* LM-fglarge: [4-gram.arpa.gz](http://www.openslr.org/resources/11/4-gram.arpa.gz)
test clean
| decoding mode | full | 16 |
|----------------------------------|------|------|
| ctc prefix beam search | 4.26 | 5.00 |
| attention decoder | 3.05 | 3.44 |
| attention rescoring | 3.72 | 4.10 |
| attention rescoring (beam 50) | 3.57 | 3.95 |
| LM-tgmed + attention rescoring | 3.56 | 4.02 |
| LM-tglarge + attention rescoring | 3.40 | 3.82 |
| LM-fglarge + attention rescoring | 3.38 | 3.74 |
test other
| decoding mode | full | 16 |
|----------------------------------|-------|-------|
| ctc prefix beam search | 10.87 | 12.87 |
| attention decoder | 9.07 | 10.44 |
| attention rescoring | 9.74 | 11.61 |
| attention rescoring (beam 50) | 9.34 | 11.13 |
| LM-tgmed + attention rescoring | 8.78 | 10.26 |
| LM-tglarge + attention rescoring | 8.34 | 9.74 |
| LM-fglarge + attention rescoring | 8.17 | 9.44 |
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# dataset related
dataset_conf:
filter_conf:
max_length: 2000
min_length: 50
token_max_length: 400
token_min_length: 1
min_output_input_ratio: 0.0005
max_output_input_ratio: 0.1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 12
grad_clip: 5
accum_grad: 1
max_epoch: 70
log_interval: 100
optim: adam
optim_conf:
lr: 0.004
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 31
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
cnn_module_norm: 'layer_norm'
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
# dataset related
dataset_conf:
filter_conf:
max_length: 2000
min_length: 50
token_max_length: 400
token_min_length: 1
min_output_input_ratio: 0.0005
max_output_input_ratio: 0.1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 3
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 12
grad_clip: 5
accum_grad: 4
max_epoch: 120
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 50000
# network architecture
# encoder related
encoder: squeezeformer
encoder_conf:
encoder_dim: 256
output_size: 256 # dimension of attention
attention_heads: 4
num_blocks: 12 # the number of encoder blocks
reduce_idx: 5
recover_idx: 11
pos_enc_layer_type: 'rel_pos'
time_reduction_layer_type: 'conv1d'
feed_forward_expansion_factor: 4
input_dropout_rate: 0.1
feed_forward_dropout_rate: 0.1
attention_dropout_rate: 0.1
cnn_module_kernel: 31
cnn_norm_type: layer_norm
adaptive_scale: true
normalize_before: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# dataset related
dataset_conf:
filter_conf:
max_length: 2000
min_length: 50
token_max_length: 400
token_min_length: 1
min_output_input_ratio: 0.0005
max_output_input_ratio: 0.1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 12
grad_clip: 5
accum_grad: 4
max_epoch: 120
log_interval: 100
optim: adamw
optim_conf:
lr: 1.e-3
weight_decay: 4.e-5
scheduler: NoamHoldAnnealing
scheduler_conf:
warmup_ratio: 0.2
hold_ratio: 0.3
max_steps: 87960
decay_rate: 1.0
min_lr: 1.e-5
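# NoamHoldAnnealing sketch (assuming the usual schedule): warm up for
# warmup_ratio * max_steps steps, hold the peak lr for hold_ratio * max_steps steps,
# then anneal towards min_lr, with the decay shape controlled by decay_rate.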
# network architecture
# encoder related
encoder: squeezeformer
encoder_conf:
encoder_dim: 512
output_size: 512 # dimension of attention
attention_heads: 8
num_blocks: 12 # the number of encoder blocks
reduce_idx: 5
recover_idx: 11
feed_forward_expansion_factor: 4
input_dropout_rate: 0.1
feed_forward_dropout_rate: 0.1
attention_dropout_rate: 0.1
cnn_module_kernel: 31
cnn_norm_type: batch_norm
adaptive_scale: true
normalize_before: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
# dataset related
dataset_conf:
syncbn: true
filter_conf:
max_length: 2000
min_length: 50
token_max_length: 400
token_min_length: 1
min_output_input_ratio: 0.0005
max_output_input_ratio: 0.1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 3
num_f_mask: 2
max_t: 100
max_f: 27
max_w: 80
# warp_for_time: true
spec_sub: true
spec_sub_conf:
num_t_sub: 3
max_t: 30
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 12
grad_clip: 5
accum_grad: 4
max_epoch: 120
log_interval: 100
optim: adamw
optim_conf:
lr: 1.e-3
weight_decay: 4.e-5
scheduler: NoamHoldAnnealing
scheduler_conf:
warmup_ratio: 0.2
hold_ratio: 0.3
max_steps: 87960
decay_rate: 1.0
min_lr: 1.e-5
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
# dataset related
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 400
token_min_length: 1
# min_output_input_ratio: 0.0005
# max_output_input_ratio: 0.1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
warp_for_time: true
spec_sub: true
spec_sub_conf:
num_t_sub: 3
max_t: 30
shuffle: true
shuffle_conf:
shuffle_size: 10000
sort: true
sort_conf:
sort_size: 2000 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 24
grad_clip: 5
accum_grad: 1
max_epoch: 120
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: squeezeformer
encoder_conf:
encoder_dim: 256
output_size: 256 # dimension of attention
attention_heads: 4
num_blocks: 12 # the number of encoder blocks
reduce_idx: 5
recover_idx: 11
time_reduction_layer_type: "stream"
feed_forward_expansion_factor: 8
input_dropout_rate: 0.1
feed_forward_dropout_rate: 0.1
attention_dropout_rate: 0.1
cnn_module_kernel: 31
do_rel_shift: false
cnn_norm_type: layer_norm
adaptive_scale: true
normalize_before: false
causal: true
use_dynamic_chunk: true
use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
# dataset related
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 400
token_min_length: 1
# min_output_input_ratio: 0.0005
# max_output_input_ratio: 0.1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
# warp_for_time: true
spec_sub: true
spec_sub_conf:
num_t_sub: 3
max_t: 30
shuffle: true
shuffle_conf:
shuffle_size: 10000
sort: true
sort_conf:
sort_size: 2000 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 12
grad_clip: 5
accum_grad: 2
max_epoch: 120
log_interval: 100
optim: adamw
optim_conf:
lr: 8.e-4
weight_decay: 4.e-5
scheduler: NoamHoldAnnealing
scheduler_conf:
warmup_ratio: 0.2
hold_ratio: 0.3
max_steps: 175680
decay_rate: 1.0
min_lr: 1.e-5
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# dataset related
dataset_conf:
filter_conf:
max_length: 2000
min_length: 50
token_max_length: 400
token_min_length: 1
min_output_input_ratio: 0.0005
max_output_input_ratio: 0.1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 120
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
#!/bin/bash
# Copyright 2014 Vassil Panayotov
# 2014 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <src-dir> <dst-dir>"
echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
exit 1
fi
src=$1
dst=$2
# all utterances are FLAC compressed
if ! which flac >&/dev/null; then
echo "Please install 'flac' on ALL worker nodes!"
exit 1
fi
mkdir -p $dst || exit 1
[ ! -d $src ] && echo "$0: no such directory $src" && exit 1
wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp
trans=$dst/text; [[ -f "$trans" ]] && rm $trans
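# The loops below produce two Kaldi-style files (format shown for illustration):
#   wav.scp : <utterance-id> <path/to/utterance>.flac
#   text    : <utterance-id> <TRANSCRIPT>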
for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do
reader=$(basename $reader_dir)
if ! [ $reader -eq $reader ]; then # not integer.
echo "$0: unexpected subdirectory name $reader"
exit 1
fi
for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do
chapter=$(basename $chapter_dir)
if ! [ "$chapter" -eq "$chapter" ]; then
echo "$0: unexpected chapter-subdirectory name $chapter"
exit 1
fi
find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \
awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac\n", $0, dir, $0}' >>$wav_scp || exit 1
chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
[ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1
cat $chapter_trans >>$trans
done
done
echo "$0: successfully prepared data in $dst"
exit 0
#!/bin/bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
remove_archive=false
if [ "$1" == --remove-archive ]; then
remove_archive=true
shift
fi
if [ $# -ne 3 ]; then
echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean"
echo "With --remove-archive it will remove the archive after successfully un-tarring it."
echo "<corpus-part> can be one of: dev-clean, test-clean, dev-other, test-other,"
echo " train-clean-100, train-clean-360, train-other-500."
exit 1
fi
data=$1
url=$2
part=$3
if [ ! -d "$data" ]; then
echo "$0: no such directory $data"
exit 1
fi
part_ok=false
list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500"
for x in $list; do
if [ "$part" == $x ]; then part_ok=true; fi
done
if ! $part_ok; then
echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
exit 1
fi
if [ -z "$url" ]; then
echo "$0: empty URL base."
exit 1
fi
if [ -f $data/LibriSpeech/$part/.complete ]; then
echo "$0: data part $part was already successfully extracted, nothing to do."
exit 0
fi
# sizes of the archive files in bytes. These are from some older versions.
sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128"
# sizes_new contains the archive file sizes of the final release. Some of these sizes are for
# things we probably won't download.
sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606"
if [ -f $data/$part.tar.gz ]; then
size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
size_ok=false
for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done
if ! $size_ok; then
echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
echo "does not equal the size of one of the archives."
rm $data/$part.tar.gz
else
echo "$data/$part.tar.gz exists and appears to be complete."
fi
fi
if [ ! -f $data/$part.tar.gz ]; then
if ! which wget >/dev/null; then
echo "$0: wget is not installed."
exit 1
fi
full_url=$url/$part.tar.gz
echo "$0: downloading data from $full_url. This may take some time, please be patient."
if ! wget -P $data --no-check-certificate $full_url; then
echo "$0: error executing wget $full_url"
exit 1
fi
fi
if ! tar -C $data -xvzf $data/$part.tar.gz; then
echo "$0: error un-tarring archive $data/$part.tar.gz"
exit 1
fi
touch $data/LibriSpeech/$part/.complete
echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"
if $remove_archive; then
echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
rm $data/$part.tar.gz
fi
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_BIN=${BUILD_DIR}/../fc_base/openfst-build/src
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_BIN}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU; otherwise it's multi-GPU training based on DDP in PyTorch
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
# data
data_url=www.openslr.org/resources/12
# use your own data path
datadir=/export/data/en-asr-data/OpenSLR
# wav data dir
wave_data=data
# Optional train_config
# 1. conf/train_transformer_large.yaml: Standard transformer
train_config=conf/train_conformer.yaml
checkpoint=
cmvn=true
do_delta=false
dir=exp/sp_spec_aug
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
# you may need to adjust it if you cannot get results close to those in README.md
average_num=10
decode_modes="attention_rescoring ctc_greedy_search ctc_prefix_beam_search attention"
. tools/parse_options.sh || exit 1;
# bpemode (unigram or bpe)
nbpe=5000
bpemode=unigram
set -e
set -u
set -o pipefail
train_set=train_960
dev_set=dev
recog_set="test_clean test_other dev_clean dev_other"
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
local/download_and_untar.sh ${datadir} ${data_url} ${part}
done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to do the following data preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: Data preparation"
for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
# use underscore-separated names in data directories.
local/data_prep_torchaudio.sh ${datadir}/LibriSpeech/${part} $wave_data/${part//-/_}
done
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
### Task dependent. You have to design training and dev sets by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 1: Feature Generation"
mkdir -p $wave_data/train_960
# merge total training data
for set in train_clean_100 train_clean_360 train_other_500; do
for f in `ls $wave_data/$set`; do
cat $wave_data/$set/$f >> $wave_data/train_960/$f
done
done
mkdir -p $wave_data/dev
# merge total dev data
for set in dev_clean dev_other; do
for f in `ls $wave_data/$set`; do
cat $wave_data/$set/$f >> $wave_data/$dev_set/$f
done
done
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp $wave_data/$train_set/wav.scp \
--out_cmvn $wave_data/$train_set/global_cmvn
fi
dict=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p data/lang_char/
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
# we borrowed the BPE-related code and scripts from ESPnet.
cut -f 2- -d" " $wave_data/${train_set}/text > $wave_data/lang_char/input.txt
tools/spm_train --input=$wave_data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
tools/spm_encode --model=${bpemodel}.model --output_format=piece < $wave_data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
wc -l ${dict}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# Prepare wenet required data
echo "Prepare data, prepare required format"
for x in $dev_set ${recog_set} $train_set ; do
tools/make_raw_list.py $wave_data/$x/wav.scp $wave_data/$x/text \
$wave_data/$x/data.list
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
cmvn_opts=
$cmvn && cmvn_opts="--cmvn $wave_data/${train_set}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type raw \
--symbol_table $dict \
--bpe_model ${bpemodel}.model \
--train_data $wave_data/$train_set/data.list \
--cv_data $wave_data/$dev_set/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $num_gpus \
--ddp.rank $i \
--ddp.dist_backend $dist_backend \
--num_workers 1 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test model, please specify the model you want to test by --checkpoint
cmvn_opts=
$cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
# TODO, Add model average here
mkdir -p $dir/test
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
ctc_weight=0.5
# Poll GPU ids in round-robin order, beginning with index 0
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
idx=0
for test in $recog_set; do
for mode in ${decode_modes}; do
{
{
test_dir=$dir/${test}_${mode}
mkdir -p $test_dir
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type raw \
--dict $dict \
--bpe_model ${bpemodel}.model \
--test_data $wave_data/$test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--result_file $test_dir/text_bpe \
--ctc_weight $ctc_weight \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp
cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp
tools/spm_decode --model=${bpemodel}.model --input_format=piece \
< $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value_tmp
paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value_tmp > $test_dir/text
python tools/compute-wer.py --char=1 --v=1 \
$wave_data/$test/text $test_dir/text > $test_dir/wer
} &
((idx+=1))
if [ $idx -eq $num_gpus ]; then
idx=0
fi
}
done
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip
fi
# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
lm=data/local/lm
lexicon=data/local/dict/lexicon.txt
mkdir -p $lm
mkdir -p data/local/dict
# 7.1 Download & format LM
which_lm=3-gram.pruned.1e-7.arpa.gz
if [ ! -e ${lm}/${which_lm} ]; then
wget http://www.openslr.org/resources/11/${which_lm} -P ${lm}
fi
echo "unzip lm($which_lm)..."
gunzip -k ${lm}/${which_lm} -c > ${lm}/lm.arpa
echo "Lm saved as ${lm}/lm.arpa"
# 7.2 Prepare dict
unit_file=$dict
bpemodel=$bpemodel
# use $dir/words.txt (unit_file) and $dir/train_960_unigram5000 (bpemodel)
# if you download pretrained librispeech conformer model
cp $unit_file data/local/dict/units.txt
if [ ! -e ${lm}/librispeech-lexicon.txt ]; then
wget http://www.openslr.org/resources/11/librispeech-lexicon.txt -P ${lm}
fi
echo "build lexicon..."
tools/fst/prepare_dict.py $unit_file ${lm}/librispeech-lexicon.txt \
$lexicon $bpemodel.model
echo "lexicon saved as '$lexicon'"
# 7.3 Build decoding TLG
tools/fst/compile_lexicon_token_fst.sh \
data/local/dict data/local/tmp data/local/lang
tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
# 7.4 Decoding with runtime
fst_dir=data/lang_test
for test in ${recog_set}; do
./tools/decode.sh --nj 6 \
--beam 10.0 --lattice_beam 5 --max_active 7000 --blank_skip_thresh 0.98 \
--ctc_weight 0.5 --rescoring_weight 1.0 --acoustic_scale 1.2 \
--fst_path $fst_dir/TLG.fst \
--dict_path $fst_dir/words.txt \
data/$test/wav.scp data/$test/text $dir/final.zip $fst_dir/units.txt \
$dir/lm_with_runtime_${test}
tail $dir/lm_with_runtime_${test}/wer
done
fi
../../../tools
../../../wenet