Commit 764b3a75 authored by Sugon_ldc

add new model
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
def get_args():
parser = argparse.ArgumentParser(description='generate wav.scp and optional label files')
parser.add_argument('--job_num', type=int, default=8,
help='index of the data sublist (job) to process')
parser.add_argument('--dir_split', required=True,
help='the path to the data_list dir '
'eg data/train/wenet1k_good_split_60/')
parser.add_argument('--label', type=int, default=0,
help='if true, label files will also be considered.')
parser.add_argument('--hypo_name', type=str, required=True,
help='the hypothesis path. eg. /hypothesis_0.txt ')
parser.add_argument('--wav_dir', type=str, required=True,
help='the wav dir path. eg. data/train/wenet_1k_untar/ ')
args = parser.parse_args()
return args
def main():
args = get_args()
data_list_dir = args.dir_split
num_lists = args.job_num
hypo = args.hypo_name
# wav_dir is the directory where your pairs of ID.wav
# (the audio file) and ID.txt (the optional label file) are stored.
# We assume that you have generated this dir in the data processing steps.
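# For example, an assumed layout (hypothetical utterance id):
#   data/train/wenet_1k_untar/BAC009S0002W0122.wav
#   data/train/wenet_1k_untar/BAC009S0002W0122.txt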
wav_dir = args.wav_dir
label = args.label
print("data_list_path is", data_list_dir)
print("num_lists is", num_lists)
print("hypo is", hypo)
print("wav_dir is", wav_dir)
i = num_lists
c = 0
hypo_path = data_list_dir + "data_sublist" + str(i) + hypo
output_wav = data_list_dir + "data_sublist" + str(i) + "/wav.scp"
output_label = data_list_dir + "data_sublist" + str(i) + "/label.txt"
# bad lines are just for debugging
output_bad_lines = data_list_dir + "data_sublist" + str(i) + "/bad_line.txt"
with open(hypo_path, 'r', encoding="utf-8") as reader:
hypo_lines = reader.readlines()
wavs = []
labels = []
bad_files = []
for x in hypo_lines:
c += 1
file_id = x.split()[0]
label_path = wav_dir + file_id + ".txt"
wav_path = wav_dir + file_id + ".wav\n"
wav_line = file_id + " " + wav_path
wavs.append(wav_line)
if label:
try:
with open(label_path, 'r', encoding="utf-8") as reader1:
label_line = reader1.readline()
except OSError:
# record the missing/unreadable label file and skip this utterance,
# so we never reference an undefined label_line below
bad_files.append(label_path)
continue
label_line = file_id + " " + label_line + "\n"
labels.append(label_line)
with open(output_wav, 'w', encoding="utf-8") as writer2:
for wav in wavs:
writer2.write(wav)
with open(output_bad_lines, 'w', encoding="utf-8") as writer4:
for line in bad_files:
writer4.write(line)
if label:
with open(output_label, 'w', encoding="utf-8") as writer3:
for label in labels:
writer3.write(label)
if __name__ == '__main__':
main()
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
def get_args():
parser = argparse.ArgumentParser(description='split data.list into N sublists')
parser.add_argument('--job_nums', type=int, default=8,
help='number of total split jobs')
parser.add_argument('--data_list_path', required=True,
help='the path to the data.list file')
parser.add_argument('--output_dir', required=True,
help='path to output dir, '
'eg --output_dir=data/train/aishell_split_60')
args = parser.parse_args()
return args
def main():
args = get_args()
data_list_path = args.data_list_path
num_lists = args.job_nums
output_dir = args.output_dir
print("data_list_path is", data_list_path)
print("num_lists is", num_lists)
print("output_dir is", output_dir)
os.makedirs(output_dir, exist_ok=True)
with open(data_list_path, 'r', encoding="utf-8") as reader:
data_list_we = reader.readlines()
# divide data.list equally
len_d = int(len(data_list_we) / num_lists)
rest_lines = data_list_we[num_lists * len_d:]
rest_len = len(rest_lines)
print("total num of lines", len(data_list_we), "rest len is", rest_len)
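# For example (hypothetical numbers): with 100 lines and num_lists = 8,
# len_d = 12 and rest_len = 4, so sublists 0-3 get 13 lines and sublists 4-7 get 12.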
# generate N sublists
for i in range(num_lists):
print("current dir num", i)
out_put_sub_dir = output_dir + "/" + "data_sublist" + str(i) + "/"
os.makedirs(out_put_sub_dir, exist_ok=True)
output_list = out_put_sub_dir + "data_list"
with open(output_list, 'w', encoding="utf-8") as writer:
new_list = data_list_we[i * len_d: (i + 1) * len_d]
if i < rest_len:
new_list.append(rest_lines[i])
for x in new_list:
# output list
writer.write(x)
if __name__ == '__main__':
main()
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/server/x86/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
iter_num=2
stage=1
stop_stage=1
pseudo_data_ratio=0.75
dir=exp/conformer_test_fully_supervised
data_list=data_aishell.list
supervised_data_list=data_aishell.list
unsupervised_data_list=wenet_1khr.list
dir_split=wenet_split_60_test/
out_data_list=data/train/wenet_1khr_nst0.list
num_split=1
. tools/parse_options.sh || exit 1;
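# All of the variables above can be overridden from the command line via
# tools/parse_options.sh, e.g. (hypothetical settings, assuming this script is saved as run.sh):
#   bash run.sh --stage 1 --stop_stage 2 --iter_num 3 --dir exp/conformer_nst_example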
# Stage 1 trains the initial teacher and generates initial pseudo-labels.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "******** stage 1 training the initial teacher ********"
bash run_nst.sh --dir $dir \
--data_list $data_list \
--supervised_data_list $supervised_data_list \
--unsupervised_data_list $unsupervised_data_list \
--dir_split $dir_split \
--out_data_list $out_data_list \
--enable_nst 0 \
--pseudo_data_ratio $pseudo_data_ratio \
--num_split $num_split
fi
# Stage 2 trains the nst iterations.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for ((i = 0; i < $iter_num; ++i)); do
{
echo "******** stage 2 training nst iteration number $i ********"
bash run_nst.sh --dir exp/conformer_nst$((i+1)) \
--supervised_data_list data_aishell.list \
--data_list wenet_1khr_nst${i}.list \
--enable_nst 1 \
--job_num 0 \
--num_split $num_split \
--hypo_name hypothesis_nst$((i+1)).txt \
--untar_dir wenet_1khr_untar_nst$((i+1))/ \
--tar_dir wenet_1khr_tar_nst$((i+1))/ \
--out_data_list wenet_1khr_nst$((i+1)).list \
--pseudo_data_ratio $pseudo_data_ratio
}
done
fi
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This is an augmented version of aishell-1 "run.sh" to make the code compatible with noisy student training
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO
stage=1 # start from 0 if you need to start from data preparation
stop_stage=8
# here are extra parameters used in NST
cer_out_dir=""
dir=""
supervised_data_list=""
checkpoint=
unsupervised_data_list=""
data_list=""
hypo_name=""
out_data_list=""
# parameters with default values:
label=0
average_num=30
nj=16
num_split=1
cer_hypo_threshold=10
speak_rate_threshold=0
label_file="label.txt"
utter_time_file="utter_time.json"
enable_nst=1
job_num=0
dir_split="wenet_split_60_test/"
hypo_name="hypothesis_nst${job_num}.txt"
wav_dir="data/train/wenet_1k_untar/"
tar_dir="data/train/wenet_1khr_tar/"
untar_dir="data/train/wenet_1khr_untar/"
cer_hypo_dir="wenet_cer_hypo"
cer_label_dir="wenet_cer_label"
pseudo_data_ratio=0.75
# The number of machines (nodes) for multi-machine training; 1 means single-machine training.
# NFS is required if num_nodes > 1.
num_nodes=1
# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`.
# You should set the node_rank=0 on the first machine, set the node_rank=1
# on the second machine, and so on.
node_rank=0
dict=data/dict/lang_char.txt
# data_type can be `raw` or `shard`. Typically, `raw` is used for small datasets,
# while `shard` is used for large datasets (over 1k hours) and is faster for
# reading data and training.
data_type=shard
num_utts_per_shard=1000
train_set=train
train_config=conf/train_conformer.yaml
cmvn=true
average_checkpoint=true
target_pt=80
decode_checkpoint=$dir/$target_pt.pt
# here we only use attention_rescoring for NST
decode_modes="attention_rescoring"
. tools/parse_options.sh || exit 1;
# print the settings
echo "setting for this run:"
echo "dir is ${dir}"
echo "data list is ${data_list}"
echo "job_num is ${job_num}"
echo "cer_out_dir is ${cer_out_dir}"
echo "average_num is ${average_num}"
echo "checkpoint is ${checkpoint} "
echo "enable_nst is ${enable_nst} "
# We assume that you have finished the data pre-processing steps from -1 to 3 in aishell1/s0/run.sh.
# You can modify "--supervised_data_list" to match your supervised data list.
# Here we use wenetspeech as the unsupervised data; you can run the data pre-processing steps from -1 to 3 in
# wenetspeech/s0/run.sh and modify "--unsupervised_data_list" to match your unsupervised data list.
# You can follow this process to generate your own dataset.
# We have also included the code for extracting data in local/...
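# For reference, a raw-format data.list has one JSON entry per utterance, e.g. (hypothetical paths):
#   {"key": "BAC009S0002W0122", "wav": "/path/to/BAC009S0002W0122.wav", "txt": "..."}
# A shard-format data.list simply lists the tar shard paths, one per line.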
# stage 1 is for training
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "********step 1 start time : $now ********"
mkdir -p $dir
# You have to rm `INIT_FILE` manually when you resume or restart a
# multi-machine training.
INIT_FILE=$dir/ddp_init
rm -f $INIT_FILE
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
# The global_cmvn file needs to be calculated by combining both the supervised and unsupervised datasets,
# and it should be placed at data/${train_set}/global_cmvn.
cmvn_opts=
$cmvn && cp data/${train_set}/global_cmvn $dir/global_cmvn
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py rewrites $train_config as $dir/train.yaml with the model input
# and output dimensions filled in; $dir/train.yaml will be used for inference
# and export.
echo "checkpoint is " ${checkpoint}
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
echo "gpu number $i "
# Rank of each gpu/process, used to determine whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
--train_data data/$train_set/$data_list \
--cv_data data/dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 1 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
# In stage 2, we average the final checkpoints and calculate the test and dev accuracy.
# Please make sure your test and dev data.list files are in the proper location.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Test model, please specify the model you want to test by --checkpoint
# here we test with the aishell test and dev sets
echo "******** step 2 start time : $now ********"
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# export model
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip \
--output_quant_file $dir/final_quant.zip
# Please specify decoding_chunk_size for unified streaming and
# non-streaming model. The default value is -1, which is full chunk
# for non-streaming inference.
decoding_chunk_size=
ctc_weight=0.5
reverse_weight=0.0
# test_wer
for mode in ${decode_modes}; do
{
#test_dir=$dir/test_${mode}_${target_pt}pt # for target pt
test_dir=$dir/test_${mode}${average_num}pt # for average pt
mkdir -p $test_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data data/test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--reverse_weight $reverse_weight \
--result_file $test_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
echo "before compute-wer"
python tools/compute-wer.py --char=1 --v=1 \
data/test/text $test_dir/text > $test_dir/wer
} &
done
# dev_wer
for mode in ${decode_modes}; do
{
#test_dir=$dir/test_${mode}_${target_pt}pt # for target pt
dev_dir=$dir/dev_${mode}${average_num}pt # for average pt
mkdir -p $dev_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data data/dev/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--reverse_weight $reverse_weight \
--result_file $dev_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
echo "before compute-wer"
python tools/compute-wer.py --char=1 --v=1 \
data/dev/text $dev_dir/text > $dev_dir/wer
} &
done
wait
fi
# Split the (unsupervised) data list into N sublists, where N depends on the number of available CPUs in your cluster.
# At inference time, the N sublists are processed in parallel.
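# For example, with num_split=2 this stage produces (an assumed layout):
#   data/train/wenet_split_60_test/data_sublist0/data_list
#   data/train/wenet_split_60_test/data_sublist1/data_list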
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ] && [ ${enable_nst} -eq 0 ]; then
echo "********step 3 start time : $now ********"
python local/split_data_list.py \
--job_nums $num_split \
--data_list_path data/train/$unsupervised_data_list \
--output_dir data/train/$dir_split
fi
# Stage 4 performs inference without a language model on the given sublist (job_num).
# Here is an example usage:
# bash run_nst.sh --stage 4 --stop-stage 4 --job_num $i --dir_split data/train/wenet_4khr_split_60/
# --hypo_name hypothesis_0.txt --dir exp/conformer_aishell2_wenet4k_nst4
# You need to specify "job_num" n (n <= N), "dir_split" which is the dir path for the split data,
# "hypo_name" which is the path for the output hypothesis, and "dir" which is the path where we train and store the model.
# For each gpu, you can run with a different job_num to perform data-parallel computing.
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "********step 4 start time : $now ********"
# we assume you have run stage 2 so that avg_${average_num}.pt exists
decode_checkpoint=$dir/avg_${average_num}.pt
# Please specify decoding_chunk_size for unified streaming and
# non-streaming model. The default value is -1, which is full chunk
# for non-streaming inference.
decoding_chunk_size=
ctc_weight=0.5
reverse_weight=0.0
mode="attention_rescoring"
gpu_id=0
echo "job number ${job_num} "
echo "data_list dir is ${dir_split}"
echo "hypo name is " $hypo_name
echo "dir is ${dir}"
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data data/train/${dir_split}data_sublist${job_num}/data_list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--reverse_weight $reverse_weight \
--result_file data/train/${dir_split}data_sublist${job_num}/${hypo_name} \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
echo "end time : $now"
fi
# Generate the wav.scp file and the (optional) label.txt file for each sublist we generated in step 3.
# The wav_dir should be prepared in the data processing step as mentioned above.
# You need to specify "job_num" n (n <= N), "dir_split" which is the dir path for the split data,
# "hypo_name" which is the path for the output hypothesis, and "dir" which is the path where we train and store the model.
# wav_dir is the directory that stores the raw wav files and the possible labels.
# If you have labels for the unsupervised dataset, set label to 1, otherwise keep it 0.
# For each gpu or cpu, you can run with a different job_num to perform data-parallel computing.
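# For reference, a wav.scp line written by local/get_wav_labels.py looks like
# (hypothetical utterance id):
#   BAC009S0002W0122 data/train/wenet_1k_untar/BAC009S0002W0122.wav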
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ] && [ ${enable_nst} -eq 0 ]; then
echo "********step 5 start time : $now ********"
python local/get_wav_labels.py \
--dir_split data/train/${dir_split} \
--hypo_name /$hypo_name \
--wav_dir $wav_dir \
--job_num $job_num \
--label $label
fi
# Calculate cer-hypo between the hypotheses decoded with and without a language model.
# We assume that you have finished language model
# training using the wenet aishell-1 pipeline. (You should have data/lang_test/words.txt and data/lang_test/TLG.fst ready.)
# Here is an example usage:
# bash run_nst.sh --stage 6 --stop-stage 6 --job_num n --dir_split data/train/wenet1k_redo_split_60/
# --cer_hypo_dir wenet1k_cer_hypo --hypo_name hypothesis_nst.txt --dir exp/conformer_no_filter_redo_nst6
# You need to specify "job_num" n (n <= N), "dir_split" which is the dir path for the split data,
# "hypo_name" which is the path for the output hypothesis, and "dir" which is the path where we train and store the model.
# For each gpu, you can run with a different job_num to perform data-parallel computing.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
echo "********step 6 start time : $now ********"
chunk_size=-1
mode="attention_rescoring"
test_dir=$dir/test_${mode}_${job_num}
now=$(date +"%T")
echo "start time : $now"
echo "GPU dir is " $job_num "dir_split is " data/train/${dir_split}
echo "nj is" $nj "hypo_file is" $hypo_name "cer out is" $cer_hypo_dir "lm is 4gram"
echo "dir is " $dir
if [ ! -f data/train/${dir_split}data_sublist${job_num}/${hypo_name} ]; then
echo "text file does not exist"
exit 1;
fi
./tools/decode.sh --nj 16 \
--beam 15.0 --lattice_beam 7.5 --max_active 7000 \
--blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
--chunk_size $chunk_size \
--fst_path data/lang_test/TLG.fst \
data/train/${dir_split}data_sublist${job_num}/wav.scp \
data/train/${dir_split}data_sublist${job_num}/${hypo_name} $dir/final.zip \
data/lang_test/words.txt $dir/Hypo_LM_diff10/${cer_hypo_dir}_${job_num}
now=$(date +"%T")
echo "end time : $now"
fi
# (Optional: only run this stage if you have true labels for the unsupervised data.)
# Calculate cer-label between the true labels and the hypotheses decoded with a language model.
# You can use the output CER to evaluate NST's performance.
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ] && [ ${label} -eq 1 ]; then
echo "********step 7 start time : $now ********"
chunk_size=-1
mode="attention_rescoring"
test_dir=$dir/test_${mode}_${job_num}
now=$(date +"%T")
echo "start time : $now"
echo "GPU dir is " $job_num "dir_split is " data/train/${dir_split}
echo "nj is" $nj "label_file is" $label_file "cer out is" $cer_label_dir "lm is 4gram"
echo "dir is " $dir
echo "label_file " data/train/${dir_split}data_sublist${job_num}/${label_file}
if [ ! -f data/train/${dir_split}data_sublist${job_num}/${label_file} ]; then
echo "text file does not exist"
exit 1;
fi
./tools/decode.sh --nj 16 \
--beam 15.0 --lattice_beam 7.5 --max_active 7000 \
--blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
--chunk_size $chunk_size \
--fst_path data/lang_test/TLG.fst \
data/train/${dir_split}data_sublist${job_num}/wav.scp \
data/train/${dir_split}data_sublist${job_num}/${label_file} $dir/final.zip \
data/lang_test/words.txt $dir/Hypo_LM_diff10/${cer_label_dir}_${job_num}
now=$(date +"%T")
echo "end time : $now"
fi
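# Stage 8 filters the pseudo-labeled utterances with the cer-hypo and speaking-rate
# thresholds, packs the filtered data into tar_dir, and generates the data list for the
# next NST iteration by mixing pseudo-labeled and supervised data according to
# pseudo_data_ratio.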
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
echo "********step 8 start time : $now ********"
python local/generate_filtered_pseudo_label.py \
--cer_hypo_dir $cer_hypo_dir \
--untar_dir data/train/$untar_dir \
--wav_dir $wav_dir \
--dir_num $job_num \
--cer_hypo_threshold $cer_hypo_threshold \
--speak_rate_threshold $speak_rate_threshold \
--dir $dir \
--tar_dir data/train/$tar_dir \
--utter_time_file $utter_time_file
python local/generate_data_list.py \
--tar_dir data/train/$tar_dir \
--out_data_list data/train/$out_data_list \
--supervised_data_list data/train/$supervised_data_list \
--pseudo_data_ratio $pseudo_data_ratio
fi
# Performance Record
## Conformer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.001, batch size 8, 8 gpu, acc_grad 1, 100 epochs, dither 0.1
* Training weight info: transducer_weight 0.75, ctc_weight 0.1, attention_weight 0.15, average_num 10
* Predictor type: lstm
| decoding mode | CER |
|---------------------------|-------|
| rnnt greedy search | 5.24 |
* after 165 epochs and avg 30
| decoding mode | CER |
|---------------------------|-------|
| rnnt greedy search | 5.02 |
| ctc prefix beam search | 5.17 |
| ctc prefix beam + rescore | 4.48 |
## Conformer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.001, batch size 20, 8 gpu, acc_grad 1, 140 epochs, dither 0.1
* Training weight info: transducer_weight 0.4, ctc_weight 0.2, attention_weight 0.4, average_num 10
* Predictor type: lstm
* Model link: https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/aishell/20220728_conformer_rnnt_exp.tar.gz
| decoding mode | CER |
|---------------------------------------|-------|
| rnnt greedy search | 4.88 |
| rnnt beam search | 4.67 |
| ctc prefix beam search | 5.02 |
| ctc prefix beam + rescore | 4.51 |
| ctc prefix beam + rnnt&attn rescore | 4.45 |
| rnnt prefix beam + rnnt&attn rescore | 4.49 |
## U2++ Conformer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.001, batch size 4, 32 gpu, acc_grad 1, 360 epochs
* Training weight info: transducer_weight 0.75, ctc_weight 0.1, reverse_weight 0.15, average_num 30
* Predictor type: lstm
| decoding mode/chunk size | full | 16 |
|---------------------------|-------|-------|
| rnnt greedy search | 5.68 | 6.26 |
## Pretrain
* Pretrain model: https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/aishell/20210601_u2%2B%2B_conformer_exp.tar.gz
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.001, batch size 8, 8 gpu, acc_grad 1, 140 epochs
* Training weight info: transducer_weight 0.4, ctc_weight 0.2, attention_weight 0.4, reverse_weight 0.3, average_num 30
* Predictor type: lstm
| decoding mode/chunk size | full | 16 |
|-----------------------------|-------|--------|
| rnnt greedy search | 5.21 | 5.73 |
| rnnt prefix beam | 5.14 | 5.63 |
| rnnt prefix beam + rescore | 4.73 | 5.095 |
## Training loss ablation study
note:
- If rnnt is checked, greedy means rnnt greedy search; the same goes for beam.
- If rnnt is checked, rescoring means rnnt beam & attention rescoring.
- If only 'ctc & att' is checked, greedy means ctc greedy search; the same goes for beam.
- If only 'ctc & att' (AED) is checked, rescoring means ctc beam & attention rescoring.
- What if the rnnt model uses wenet-style search? Coming soon.
| rnnt | ctc | att | greedy | beam | rescoring | fusion |
|------|-----|-----|--------|------|-----------|--------|
| ✔ | ✔ | ✔ | 4.88 | 4.67 | 4.45 | 4.49 |
| ✔ | ✔ | | 5.56 | 5.46 | / | 5.40 |
| ✔ | | ✔ | 5.03 | 4.94 | 4.87 | / |
| ✔ | | | 5.64 | 5.59 | / | / |
| | ✔ | ✔ | 4.94 | 4.94 | 4.61 | / |
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: true
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
joint_conf:
join_dim: 512
prejoin_linear: True
postjoin_linear: false
joint_mode: 'add'
activation: 'tanh'
predictor: rnn
predictor_conf:
embed_size: 256
output_size: 256
embed_dropout: 0.1
hidden_size: 256
num_layers: 2
bias: true
rnn_type: 'lstm'
dropout: 0.1
decoder: bitransformer
decoder_conf:
attention_heads: 4
dropout_rate: 0.1
linear_units: 2048
num_blocks: 3
positional_dropout_rate: 0.1
r_num_blocks: 3
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid transducer+ctc+attention
model_conf:
transducer_weight: 0.75
ctc_weight: 0.1
attention_weight: 0.15
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
dataset_conf:
filter_conf:
max_length: 40960
min_length: 10
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 8
grad_clip: 4
accum_grad: 1
max_epoch: 140
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 8
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
joint_conf:
join_dim: 512
prejoin_linear: True
postjoin_linear: false
joint_mode: 'add'
activation: 'tanh'
predictor: rnn
predictor_conf:
embed_size: 256
output_size: 256
embed_dropout: 0.1
hidden_size: 256
num_layers: 2
bias: true
rnn_type: 'lstm'
dropout: 0.1
decoder: bitransformer
decoder_conf:
attention_heads: 4
dropout_rate: 0.1
linear_units: 2048
num_blocks: 3
positional_dropout_rate: 0.1
r_num_blocks: 3
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid transducer+ctc+attention
model_conf:
transducer_weight: 0.75
ctc_weight: 0.1
attention_weight: 0.15
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
dataset_conf:
filter_conf:
max_length: 40960
min_length: 10
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 4
grad_clip: 4
accum_grad: 1
max_epoch: 130
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: true
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
joint_conf:
join_dim: 320
prejoin_linear: true
postjoin_linear: false
joint_mode: 'add'
activation: 'tanh'
predictor: embedding
predictor_conf:
embed_size: 320
embed_dropout: 0.1
n_head: 4
history_size: 5
bias: false
decoder: bitransformer
decoder_conf:
attention_heads: 4
dropout_rate: 0.1
linear_units: 2048
num_blocks: 3
positional_dropout_rate: 0.1
r_num_blocks: 3
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid transducer+ctc+attention
model_conf:
transducer_weight: 0.4
ctc_weight: 0.2
attention_weight: 0.4
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
dataset_conf:
filter_conf:
max_length: 40960
min_length: 10
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 30
grad_clip: 4
accum_grad: 1
max_epoch: 500
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
../s0/local
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
# 2022 Binbin Zhang(binbizha@qq.com)
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
# The number of machines (nodes) for multi-machine training; 1 means single-machine training.
# NFS is required if num_nodes > 1.
num_nodes=1
# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`.
# You should set the node_rank=0 on the first machine, set the node_rank=1
# on the second machine, and so on.
node_rank=0
# The aishell dataset location, please change this to your own path.
# Make sure to use an absolute path. DO NOT use a relative path!
data=/export/data/asr-data/OpenSLR/33/
data_url=www.openslr.org/resources/33
nj=16
dict=data/dict/lang_char.txt
# data_type can be `raw` or `shard`. Typically, `raw` is used for small datasets,
# while `shard` is used for large datasets (over 1k hours) and is faster for
# reading data and training.
data_type=raw
num_utts_per_shard=1000
train_set=train
train_config=conf/conformer_u2pp_rnnt.yaml
cmvn=true
dir=exp/conformer_rnnt
checkpoint=
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=30
decode_modes="rnnt_beam_search"
. tools/parse_options.sh || exit 1;
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
local/download_and_untar.sh ${data} ${data_url} data_aishell
local/download_and_untar.sh ${data} ${data_url} resource_aishell
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Data preparation
local/aishell_data_prep.sh ${data}/data_aishell/wav \
${data}/data_aishell/transcript
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# remove the space between the text labels for the Mandarin dataset
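# For example (a hypothetical transcript line):
#   "BAC009S0002W0122 甚 至 出 现 交 易" becomes "BAC009S0002W0122 甚至出现交易"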
for x in train dev test; do
cp data/${x}/text data/${x}/text.org
paste -d " " <(cut -f 1 -d" " data/${x}/text.org) \
<(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \
> data/${x}/text
rm data/${x}/text.org
done
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp data/${train_set}/wav.scp \
--out_cmvn data/$train_set/global_cmvn
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 is for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
tools/text2token.py -s 1 -n 1 data/train/text | cut -f 2- -d" " \
| tr " " "\n" | sort | uniq | grep -a -v -e '^\s*$' | \
awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict
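# The resulting lang_char.txt looks like this (token ids below are hypothetical):
#   <blank> 0
#   <unk> 1
#   一 2
#   丁 3
#   ...
#   <sos/eos> 4232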
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Prepare data, prepare required format"
for x in dev test ${train_set}; do
if [ $data_type == "shard" ]; then
tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
--num_threads 16 data/$x/wav.scp data/$x/text \
$(realpath data/$x/shards) data/$x/data.list
else
tools/make_raw_list.py data/$x/wav.scp data/$x/text \
data/$x/data.list
fi
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
mkdir -p $dir
# You have to rm `INIT_FILE` manually when you resume or restart a
# multi-machine training.
INIT_FILE=$dir/ddp_init
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp data/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py rewrites $train_config as $dir/train.yaml with the model input
# and output dimensions filled in; $dir/train.yaml will be used for inference
# and export.
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process, used to determine whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
--train_data data/$train_set/data.list \
--cv_data data/dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 1 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test model, please specify the model you want to test by --checkpoint
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Please specify decoding_chunk_size for unified streaming and
# non-streaming model. The default value is -1, which is full chunk
# for non-streaming inference.
decoding_chunk_size=
# only used in rescore mode for weighting different scores
rescore_ctc_weight=0.5
rescore_transducer_weight=0.5
rescore_attn_weight=0.5
# only used in beam search, either pure beam search mode OR beam search inside rescoring
search_ctc_weight=0.3
search_transducer_weight=0.7
reverse_weight=0.0
for mode in ${decode_modes}; do
{
test_dir=$dir/test_${mode}
mkdir -p $test_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data data/test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $rescore_ctc_weight \
--transducer_weight $rescore_transducer_weight \
--attn_weight $rescore_attn_weight \
--search_ctc_weight $search_ctc_weight \
--search_transducer_weight $search_transducer_weight \
--reverse_weight $reverse_weight \
--result_file $test_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
python tools/compute-wer.py --char=1 --v=1 \
data/test/text $test_dir/text > $test_dir/wer
} &
done
wait
fi
../../../tools
../../../wenet
# Performance Record
## Conformer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.002, batch size 18, 4 gpu, acc_grad 4, 240 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 20
* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a
| decoding mode | CER |
|---------------------------|-------|
| attention decoder | 5.18 |
| ctc greedy search | 4.94 |
| ctc prefix beam search | 4.94 |
| attention rescoring | 4.61 |
| LM + attention rescoring | 4.36 |
## U2++ Conformer Result
* Feature info: using fbank feature, dither=1.0, cmvn, online speed perturb
* Training info: lr 0.001, batch size 16, 8 gpu, acc_grad 1, 360 epochs
* Decoding info: ctc_weight 0.3, reverse_weight 0.5, average_num 30, lm_scale 0.7, decoder_scale 0.1, r_decoder_scale 0.7
* Git hash: 5a1342312668e7a5abb83aed1e53256819cebf95
| decoding mode/chunk size | full | 16 |
|---------------------------|-------|-------|
| ctc greedy search | 5.19 | 5.81 |
| ctc prefix beam search | 5.17 | 5.81 |
| attention rescoring | 4.63 | 5.05 |
| LM + attention rescoring | 4.40 | 4.75 |
| HLG(k2 LM) | 4.81 | 5.27 |
| HLG(k2 LM) + attention rescoring | 4.32 | 4.70 |
## Unified Conformer Result
* Feature info: using fbank feature, dither=0, cmvn, online speed perturb
* Training info: lr 0.001, batch size 16, 8 gpu, acc_grad 1, 180 epochs, dither 0.0
* Decoding info: ctc_weight 0.5, average_num 20
* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a
| decoding mode/chunk size | full | 16 | 8 | 4 |
|---------------------------|-------|-------|-------|-------|
| attention decoder | 5.40 | 5.60 | 5.74 | 5.86 |
| ctc greedy search | 5.56 | 6.29 | 6.68 | 7.10 |
| ctc prefix beam search | 5.57 | 6.30 | 6.67 | 7.10 |
| attention rescoring | 5.05 | 5.45 | 5.69 | 5.91 |
| LM + attention rescoring | 4.73 | 5.08 | 5.22 | 5.38 |
## U2++ Transformer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb.
* Training info: lr 0.001, batch size 26, 8 gpu, acc_grad 1, 360 epochs, dither 0.1
* Decoding info: ctc_weight 0.2, reverse_weight 0.5, average_num 30
* Git hash: 65270043fc8c2476d1ab95e7c39f730017a670e0
| decoding mode/chunk size | full | 16 |
|---------------------------|-------|-------|
| ctc greedy search | 6.05 | 6.92 |
| ctc prefix beam search | 6.05 | 6.90 |
| attention rescoring | 5.11 | 5.63 |
| LM + attention rescoring | 4.82 | 5.24 |
## Transformer Result
* Feature info: using fbank feature, dither, with cmvn, online speed perturb.
* Training info: lr 0.002, batch size 26, 4 gpu, acc_grad 4, 240 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 20
* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a
| decoding mode | CER |
|---------------------------|-------|
| attention decoder | 5.69 |
| ctc greedy search | 5.92 |
| ctc prefix beam search | 5.91 |
| attention rescoring | 5.30 |
| LM + attention rescoring | 5.04 |
## Unified Transformer Result
* Feature info: using fbank feature, dither=0, with cmvn, online speed perturb.
* Training info: lr 0.002, batch size 16, 4 gpu, acc_grad 1, 240 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 20
* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a
| decoding mode/chunk size | full | 16 | 8 | 4 |
|---------------------------|-------|-------|-------|-------|
| attention decoder | 6.04 | 6.35 | 6.45 | 6.70 |
| ctc greedy search | 6.28 | 6.99 | 7.39 | 7.89 |
| ctc prefix beam search | 6.28 | 6.98 | 7.40 | 7.89 |
| attention rescoring | 5.52 | 6.05 | 6.28 | 6.62 |
| LM + attention rescoring | 5.11 | 5.59 | 5.86 | 6.17 |
## AMP Training Transformer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.002, batch size, 4 gpus, acc_grad 4, 240 epochs, dither 0.1, warm up steps 25000
* Decoding info: ctc_weight 0.5, average_num 20
* Git hash: 1bb4e5a269c535340fae5b0739482fa47733d2c1
| decoding mode | CER |
|------------------------|------|
| attention decoder | 5.73 |
| ctc greedy search | 5.92 |
| ctc prefix beam search | 5.92 |
| attention rescoring | 5.31 |
## Multi-machine Training Conformer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.004, batch size 16, 2 machines, 8\*2=16 gpus, acc_grad 4, 240 epochs, dither 0.1, warm up steps 10000
* Decoding info: ctc_weight 0.5, average_num 20
* Git hash: f6b1409023440da1998d31abbcc3826dd40aaf35
| decoding mode | CER |
|------------------------|------|
| attention decoder | 4.90 |
| ctc greedy search | 5.07 |
| ctc prefix beam search | 5.06 |
| attention rescoring | 4.65 |
## Conformer with/without Position Encoding Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.002, batch size 16, 8 gpu, acc_grad 4, 240 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 20
| decoding mode | with PE | without PE |
|------------------------|---------|------------|
| attention decoder | 5.18 | 5.73 |
| ctc greedy search | 4.94 | 4.97 |
| ctc prefix beam search | 4.94 | 4.97 |
| attention rescoring | 4.61 | 4.69 |
## Efficient Conformer v1 Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.001, batch size 16, 8 gpu, acc_grad 1, 200 epochs
* Decoding info: ctc_weight 0.5, reverse_weight 0.3, average_num 20
| decoding mode | full | 18 | 16 |
|------------------------|------|------|------|
| attention decoder | 4.99 | 5.13 | 5.16 |
| ctc prefix beam search | 4.98 | 5.23 | 5.23 |
| attention rescoring | 4.64 | 4.86 | 4.85 |
## Efficient Conformer v2 Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.001, batch size 16, 8 gpu, acc_grad 1, 200 epochs
* Decoding info: ctc_weight 0.5, reverse_weight 0.3, average_num 20
| decoding mode | full | 18 | 16 |
|------------------------|------|------|------|
| attention decoder | 4.87 | 5.03 | 5.07 |
| ctc prefix beam search | 4.97 | 5.18 | 5.20 |
| attention rescoring | 4.56 | 4.75 | 4.77 |
# Benchmark on Conformer
| IO | CER |
|--------------|-------|
| Old | 4.61 |
| UIO(Raw) | 4.63 |
| UIO(Shards) | 4.67 |
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 4
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'no_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 4
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder architecture type
normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 26
grad_clip: 5
accum_grad: 1
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 8
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
spec_sub: true
spec_sub_conf:
num_t_sub: 3
max_t: 30
spec_trim: false
spec_trim_conf:
max_t: 50
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 360
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000