Commit 764b3a75 authored by Sugon_ldc

add new model
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
def get_args():
parser = argparse.ArgumentParser(description='generate wav.scp and optional label files')
parser.add_argument('--job_num', type=int, default=8,
help='index of the data sublist (job) to process')
parser.add_argument('--dir_split', required=True,
help='the path to the data_list dir '
'eg data/train/wenet1k_good_split_60/')
parser.add_argument('--label', type=int, default=0,
help='if true, label files will also be considered.')
parser.add_argument('--hypo_name', type=str, required=True,
help='the hypothesis path. eg. /hypothesis_0.txt ')
parser.add_argument('--wav_dir', type=str, required=True,
help='the wav dir path. eg. data/train/wenet_1k_untar/ ')
args = parser.parse_args()
return args
def main():
args = get_args()
data_list_dir = args.dir_split
num_lists = args.job_num
hypo = args.hypo_name
# wav_dir is the directory where your pairs of ID.wav
# (the audio file) and ID.txt (the optional label file) are stored.
# We assume that you have generated this dir in the data processing steps.
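# For example, an assumed layout (hypothetical utterance id):
#   data/train/wenet_1k_untar/BAC009S0002W0122.wav
#   data/train/wenet_1k_untar/BAC009S0002W0122.txt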
wav_dir = args.wav_dir
label = args.label
print("data_list_path is", data_list_dir)
print("num_lists is", num_lists)
print("hypo is", hypo)
print("wav_dir is", wav_dir)
i = num_lists
c = 0
hypo_path = data_list_dir + "data_sublist" + str(i) + hypo
output_wav = data_list_dir + "data_sublist" + str(i) + "/wav.scp"
output_label = data_list_dir + "data_sublist" + str(i) + "/label.txt"
# bad lines are just for debugging
output_bad_lines = data_list_dir + "data_sublist" + str(i) + "/bad_line.txt"
with open(hypo_path, 'r', encoding="utf-8") as reader:
hypo_lines = reader.readlines()
wavs = []
labels = []
bad_files = []
for x in hypo_lines:
c += 1
file_id = x.split()[0]
label_path = wav_dir + file_id + ".txt"
wav_path = wav_dir + file_id + ".wav\n"
wav_line = file_id + " " + wav_path
wavs.append(wav_line)
if label:
try:
with open(label_path, 'r', encoding="utf-8") as reader1:
label_line = reader1.readline()
except OSError:
# record the missing/unreadable label file and skip this utterance,
# so we never reference an undefined label_line below
bad_files.append(label_path)
continue
label_line = file_id + " " + label_line + "\n"
labels.append(label_line)
with open(output_wav, 'w', encoding="utf-8") as writer2:
for wav in wavs:
writer2.write(wav)
with open(output_bad_lines, 'w', encoding="utf-8") as writer4:
for line in bad_files:
writer4.write(line)
if label:
with open(output_label, 'w', encoding="utf-8") as writer3:
for label in labels:
writer3.write(label)
if __name__ == '__main__':
main()
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
def get_args():
parser = argparse.ArgumentParser(description='split data.list into N sublists')
parser.add_argument('--job_nums', type=int, default=8,
help='number of total split jobs')
parser.add_argument('--data_list_path', required=True,
help='the path to the data.list file')
parser.add_argument('--output_dir', required=True,
help='path to output dir, '
'eg --output_dir=data/train/aishell_split_60')
args = parser.parse_args()
return args
def main():
args = get_args()
data_list_path = args.data_list_path
num_lists = args.job_nums
output_dir = args.output_dir
print("data_list_path is", data_list_path)
print("num_lists is", num_lists)
print("output_dir is", output_dir)
os.makedirs(output_dir, exist_ok=True)
with open(data_list_path, 'r', encoding="utf-8") as reader:
data_list_we = reader.readlines()
# divide data.list equally
len_d = int(len(data_list_we) / num_lists)
rest_lines = data_list_we[num_lists * len_d:]
rest_len = len(rest_lines)
print("total num of lines", len(data_list_we), "rest len is", rest_len)
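# For example (hypothetical numbers): with 100 lines and num_lists = 8,
# len_d = 12 and rest_len = 4, so sublists 0-3 get 13 lines and sublists 4-7 get 12.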
# generate N sublists
for i in range(num_lists):
print("current dir num", i)
out_put_sub_dir = output_dir + "/" + "data_sublist" + str(i) + "/"
os.makedirs(out_put_sub_dir, exist_ok=True)
output_list = out_put_sub_dir + "data_list"
with open(output_list, 'w', encoding="utf-8") as writer:
new_list = data_list_we[i * len_d: (i + 1) * len_d]
if i < rest_len:
new_list.append(rest_lines[i])
for x in new_list:
# output list
writer.write(x)
if __name__ == '__main__':
main()
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/server/x86/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
iter_num=2
stage=1
stop_stage=1
pseudo_data_ratio=0.75
dir=exp/conformer_test_fully_supervised
data_list=data_aishell.list
supervised_data_list=data_aishell.list
unsupervised_data_list=wenet_1khr.list
dir_split=wenet_split_60_test/
out_data_list=data/train/wenet_1khr_nst0.list
num_split=1
. tools/parse_options.sh || exit 1;
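# All of the variables above can be overridden from the command line via
# tools/parse_options.sh, e.g. (hypothetical settings, assuming this script is saved as run.sh):
#   bash run.sh --stage 1 --stop_stage 2 --iter_num 3 --dir exp/conformer_nst_example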
# Stage 1 trains the initial teacher and generates initial pseudo-labels.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "******** stage 1 training the initial teacher ********"
bash run_nst.sh --dir $dir \
--data_list $data_list \
--supervised_data_list $supervised_data_list \
--unsupervised_data_list $unsupervised_data_list \
--dir_split $dir_split \
--out_data_list $out_data_list \
--enable_nst 0 \
--pseudo_data_ratio $pseudo_data_ratio \
--num_split $num_split
fi
# Stage 2 trains the nst iterations.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for ((i = 0; i < $iter_num; ++i)); do
{
echo "******** stage 2 training nst iteration number $i ********"
bash run_nst.sh --dir exp/conformer_nst$((i+1)) \
--supervised_data_list data_aishell.list \
--data_list wenet_1khr_nst${i}.list \
--enable_nst 1 \
--job_num 0 \
--num_split $num_split \
--hypo_name hypothesis_nst$((i+1)).txt \
--untar_dir wenet_1khr_untar_nst$((i+1))/ \
--tar_dir wenet_1khr_tar_nst$((i+1))/ \
--out_data_list wenet_1khr_nst$((i+1)).list \
--pseudo_data_ratio $pseudo_data_ratio
}
done
fi
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This is an augmented version of aishell-1 "run.sh" to make the code compatible with noisy student training
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO
stage=1 # start from 0 if you need to start from data preparation
stop_stage=8
# here are extra parameters used in NST
cer_out_dir=""
dir=""
supervised_data_list=""
checkpoint=
unsupervised_data_list=""
data_list=""
hypo_name=""
out_data_list=""
# parameters with default values:
label=0
average_num=30
nj=16
num_split=1
cer_hypo_threshold=10
speak_rate_threshold=0
label_file="label.txt"
utter_time_file="utter_time.json"
enable_nst=1
job_num=0
dir_split="wenet_split_60_test/"
hypo_name="hypothesis_nst${job_num}.txt"
wav_dir="data/train/wenet_1k_untar/"
tar_dir="data/train/wenet_1khr_tar/"
untar_dir="data/train/wenet_1khr_untar/"
cer_hypo_dir="wenet_cer_hypo"
cer_label_dir="wenet_cer_label"
pseudo_data_ratio=0.75
# The number of machines (nodes) for multi-machine training; 1 means single-machine training.
# NFS is required if num_nodes > 1.
num_nodes=1
# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`.
# You should set the node_rank=0 on the first machine, set the node_rank=1
# on the second machine, and so on.
node_rank=0
dict=data/dict/lang_char.txt
# data_type can be `raw` or `shard`. Typically, `raw` is used for small datasets,
# while `shard` is used for large datasets (over 1k hours) and is faster for
# reading data and training.
data_type=shard
num_utts_per_shard=1000
train_set=train
train_config=conf/train_conformer.yaml
cmvn=true
average_checkpoint=true
target_pt=80
decode_checkpoint=$dir/$target_pt.pt
# here we only use attention_rescoring for NST
decode_modes="attention_rescoring"
. tools/parse_options.sh || exit 1;
# print the settings
echo "setting for this run:"
echo "dir is ${dir}"
echo "data list is ${data_list}"
echo "job_num is ${job_num}"
echo "cer_out_dir is ${cer_out_dir}"
echo "average_num is ${average_num}"
echo "checkpoint is ${checkpoint} "
echo "enable_nst is ${enable_nst} "
# We assume that you have finished the data pre-processing steps from -1 to 3 in aishell1/s0/run.sh.
# You can modify "--supervised_data_list" to match your supervised data list.
# Here we use wenetspeech as the unsupervised data; you can run the data pre-processing steps from -1 to 3 in
# wenetspeech/s0/run.sh and modify "--unsupervised_data_list" to match your unsupervised data list.
# You can follow this process to generate your own dataset.
# We have also included the code for extracting data in local/...
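# For reference, a raw-format data.list has one JSON entry per utterance, e.g. (hypothetical paths):
#   {"key": "BAC009S0002W0122", "wav": "/path/to/BAC009S0002W0122.wav", "txt": "..."}
# A shard-format data.list simply lists the tar shard paths, one per line.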
# stage 1 is for training
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "********step 1 start time : $now ********"
mkdir -p $dir
# You have to rm `INIT_FILE` manually when you resume or restart a
# multi-machine training.
INIT_FILE=$dir/ddp_init
rm -f $INIT_FILE
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
# The global_cmvn file needs to be calculated by combining both the supervised and unsupervised datasets,
# and it should be placed at data/${train_set}/global_cmvn.
cmvn_opts=
$cmvn && cp data/${train_set}/global_cmvn $dir/global_cmvn
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py rewrites $train_config as $dir/train.yaml with the model input
# and output dimensions filled in; $dir/train.yaml will be used for inference
# and export.
echo "checkpoint is " ${checkpoint}
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
echo "gpu number $i "
# Rank of each gpu/process, used to determine whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
--train_data data/$train_set/$data_list \
--cv_data data/dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 1 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
# In stage 2, we average the final checkpoints and calculate the test and dev accuracy.
# Please make sure your test and dev data.list files are in the proper location.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Test model, please specify the model you want to test by --checkpoint
# here we test with the aishell test and dev sets
echo "******** step 2 start time : $now ********"
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# export model
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip \
--output_quant_file $dir/final_quant.zip
# Please specify decoding_chunk_size for unified streaming and
# non-streaming model. The default value is -1, which is full chunk
# for non-streaming inference.
decoding_chunk_size=
ctc_weight=0.5
reverse_weight=0.0
# test_wer
for mode in ${decode_modes}; do
{
#test_dir=$dir/test_${mode}_${target_pt}pt # for target pt
test_dir=$dir/test_${mode}${average_num}pt # for average pt
mkdir -p $test_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data data/test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--reverse_weight $reverse_weight \
--result_file $test_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
echo "before compute-wer"
python tools/compute-wer.py --char=1 --v=1 \
data/test/text $test_dir/text > $test_dir/wer
} &
done
# dev_wer
for mode in ${decode_modes}; do
{
#test_dir=$dir/test_${mode}_${target_pt}pt # for target pt
dev_dir=$dir/dev_${mode}${average_num}pt # for average pt
mkdir -p $dev_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data data/dev/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--reverse_weight $reverse_weight \
--result_file $dev_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
echo "before compute-wer"
python tools/compute-wer.py --char=1 --v=1 \
data/dev/text $dev_dir/text > $dev_dir/wer
} &
done
wait
fi
# Split the (unsupervised) data list into N sublists, where N depends on the number of available CPUs in your cluster.
# At inference time, the N sublists are processed in parallel.
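# For example, with num_split=2 this stage produces (an assumed layout):
#   data/train/wenet_split_60_test/data_sublist0/data_list
#   data/train/wenet_split_60_test/data_sublist1/data_list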
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ] && [ ${enable_nst} -eq 0 ]; then
echo "********step 3 start time : $now ********"
python local/split_data_list.py \
--job_nums $num_split \
--data_list_path data/train/$unsupervised_data_list \
--output_dir data/train/$dir_split
fi
# Stage 4 performs inference without a language model on the given sublist (job_num).
# Here is an example usage:
# bash run_nst.sh --stage 4 --stop-stage 4 --job_num $i --dir_split data/train/wenet_4khr_split_60/
# --hypo_name hypothesis_0.txt --dir exp/conformer_aishell2_wenet4k_nst4
# You need to specify "job_num" n (n <= N), "dir_split" which is the dir path for the split data,
# "hypo_name" which is the path for the output hypothesis, and "dir" which is the path where we train and store the model.
# For each gpu, you can run with a different job_num to perform data-parallel computing.
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "********step 4 start time : $now ********"
# we assume you have run stage 2 so that avg_${average_num}.pt exists
decode_checkpoint=$dir/avg_${average_num}.pt
# Please specify decoding_chunk_size for unified streaming and
# non-streaming model. The default value is -1, which is full chunk
# for non-streaming inference.
decoding_chunk_size=
ctc_weight=0.5
reverse_weight=0.0
mode="attention_rescoring"
gpu_id=0
echo "job number ${job_num} "
echo "data_list dir is ${dir_split}"
echo "hypo name is " $hypo_name
echo "dir is ${dir}"
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data data/train/${dir_split}data_sublist${job_num}/data_list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--reverse_weight $reverse_weight \
--result_file data/train/${dir_split}data_sublist${job_num}/${hypo_name} \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
echo "end time : $now"
fi
# Generate the wav.scp file and the (optional) label.txt file for each sublist we generated in step 3.
# The wav_dir should be prepared in the data processing step as mentioned above.
# You need to specify "job_num" n (n <= N), "dir_split" which is the dir path for the split data,
# "hypo_name" which is the path for the output hypothesis, and "dir" which is the path where we train and store the model.
# wav_dir is the directory that stores the raw wav files and the possible labels.
# If you have labels for the unsupervised dataset, set label to 1, otherwise keep it 0.
# For each gpu or cpu, you can run with a different job_num to perform data-parallel computing.
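# For reference, a wav.scp line written by local/get_wav_labels.py looks like
# (hypothetical utterance id):
#   BAC009S0002W0122 data/train/wenet_1k_untar/BAC009S0002W0122.wav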
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ] && [ ${enable_nst} -eq 0 ]; then
echo "********step 5 start time : $now ********"
python local/get_wav_labels.py \
--dir_split data/train/${dir_split} \
--hypo_name /$hypo_name \
--wav_dir $wav_dir \
--job_num $job_num \
--label $label
fi
# Calculate cer-hypo between the hypotheses decoded with and without a language model.
# We assume that you have finished language model
# training using the wenet aishell-1 pipeline. (You should have data/lang_test/words.txt and data/lang_test/TLG.fst ready.)
# Here is an example usage:
# bash run_nst.sh --stage 6 --stop-stage 6 --job_num n --dir_split data/train/wenet1k_redo_split_60/
# --cer_hypo_dir wenet1k_cer_hypo --hypo_name hypothesis_nst.txt --dir exp/conformer_no_filter_redo_nst6
# You need to specify "job_num" n (n <= N), "dir_split" which is the dir path for the split data,
# "hypo_name" which is the path for the output hypothesis, and "dir" which is the path where we train and store the model.
# For each gpu, you can run with a different job_num to perform data-parallel computing.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
echo "********step 6 start time : $now ********"
chunk_size=-1
mode="attention_rescoring"
test_dir=$dir/test_${mode}_${job_num}
now=$(date +"%T")
echo "start time : $now"
echo "GPU dir is " $job_num "dir_split is " data/train/${dir_split}
echo "nj is" $nj "hypo_file is" $hypo_name "cer out is" $cer_hypo_dir "lm is 4gram"
echo "dir is " $dir
if [ ! -f data/train/${dir_split}data_sublist${job_num}/${hypo_name} ]; then
echo "text file does not exist"
exit 1;
fi
./tools/decode.sh --nj 16 \
--beam 15.0 --lattice_beam 7.5 --max_active 7000 \
--blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
--chunk_size $chunk_size \
--fst_path data/lang_test/TLG.fst \
data/train/${dir_split}data_sublist${job_num}/wav.scp \
data/train/${dir_split}data_sublist${job_num}/${hypo_name} $dir/final.zip \
data/lang_test/words.txt $dir/Hypo_LM_diff10/${cer_hypo_dir}_${job_num}
now=$(date +"%T")
echo "end time : $now"
fi
# (Optional: only run this stage if you have true labels for the unsupervised data.)
# Calculate cer-label between the true labels and the hypotheses decoded with a language model.
# You can use the output CER to evaluate NST's performance.
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ] && [ ${label} -eq 1 ]; then
echo "********step 7 start time : $now ********"
chunk_size=-1
mode="attention_rescoring"
test_dir=$dir/test_${mode}_${job_num}
now=$(date +"%T")
echo "start time : $now"
echo "GPU dir is " $job_num "dir_split is " data/train/${dir_split}
echo "nj is" $nj "label_file is" $label_file "cer out is" $cer_label_dir "lm is 4gram"
echo "dir is " $dir
echo "label_file " data/train/${dir_split}data_sublist${job_num}/${label_file}
if [ ! -f data/train/${dir_split}data_sublist${job_num}/${label_file} ]; then
echo "text file does not exist"
exit 1;
fi
./tools/decode.sh --nj 16 \
--beam 15.0 --lattice_beam 7.5 --max_active 7000 \
--blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
--chunk_size $chunk_size \
--fst_path data/lang_test/TLG.fst \
data/train/${dir_split}data_sublist${job_num}/wav.scp \
data/train/${dir_split}data_sublist${job_num}/${label_file} $dir/final.zip \
data/lang_test/words.txt $dir/Hypo_LM_diff10/${cer_label_dir}_${job_num}
now=$(date +"%T")
echo "end time : $now"
fi
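# Stage 8 filters the pseudo-labeled utterances with the cer-hypo and speaking-rate
# thresholds, packs the filtered data into tar_dir, and generates the data list for the
# next NST iteration by mixing pseudo-labeled and supervised data according to
# pseudo_data_ratio.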
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
echo "********step 8 start time : $now ********"
python local/generate_filtered_pseudo_label.py \
--cer_hypo_dir $cer_hypo_dir \
--untar_dir data/train/$untar_dir \
--wav_dir $wav_dir \
--dir_num $job_num \
--cer_hypo_threshold $cer_hypo_threshold \
--speak_rate_threshold $speak_rate_threshold \
--dir $dir \
--tar_dir data/train/$tar_dir \
--utter_time_file $utter_time_file
python local/generate_data_list.py \
--tar_dir data/train/$tar_dir \
--out_data_list data/train/$out_data_list \
--supervised_data_list data/train/$supervised_data_list \
--pseudo_data_ratio $pseudo_data_ratio
fi
# Performance Record
## Conformer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.001, batch size 8, 8 gpu, acc_grad 1, 100 epochs, dither 0.1
* Training weight info: transducer_weight 0.75, ctc_weight 0.1, attention_weight 0.15, average_num 10
* Predictor type: lstm
| decoding mode | CER |
|---------------------------|-------|
| rnnt greedy search | 5.24 |
* after 165 epochs and avg 30
| decoding mode | CER |
|---------------------------|-------|
| rnnt greedy search | 5.02 |
| ctc prefix beam search | 5.17 |
| ctc prefix beam + rescore | 4.48 |
## Conformer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.001, batch size 20, 8 gpu, acc_grad 1, 140 epochs, dither 0.1
* Training weight info: transducer_weight 0.4, ctc_weight 0.2, attention_weight 0.4, average_num 10
* Predictor type: lstm
* Model link: https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/aishell/20220728_conformer_rnnt_exp.tar.gz
| decoding mode | CER |
|---------------------------------------|-------|
| rnnt greedy search | 4.88 |
| rnnt beam search | 4.67 |
| ctc prefix beam search | 5.02 |
| ctc prefix beam + rescore | 4.51 |
| ctc prefix beam + rnnt&attn rescore | 4.45 |
| rnnt prefix beam + rnnt&attn rescore | 4.49 |
## U2++ Conformer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.001, batch size 4, 32 gpu, acc_grad 1, 360 epochs
* Training weight info: transducer_weight 0.75, ctc_weight 0.1, reverse_weight 0.15, average_num 30
* Predictor type: lstm
| decoding mode/chunk size | full | 16 |
|---------------------------|-------|-------|
| rnnt greedy search | 5.68 | 6.26 |
## Pretrain
* Pretrain model: https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/aishell/20210601_u2%2B%2B_conformer_exp.tar.gz
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.001, batch size 8, 8 gpu, acc_grad 1, 140 epochs
* Training weight info: transducer_weight 0.4, ctc_weight 0.2, attention_weight 0.4, reverse_weight 0.3, average_num 30
* Predictor type: lstm
| decoding mode/chunk size | full | 16 |
|-----------------------------|-------|--------|
| rnnt greedy search | 5.21 | 5.73 |
| rnnt prefix beam | 5.14 | 5.63 |
| rnnt prefix beam + rescore | 4.73 | 5.095 |
## Training loss ablation study
note:
- If rnnt is checked, greedy means rnnt greedy search; the same goes for beam.
- If rnnt is checked, rescoring means rnnt beam & attention rescoring.
- If only 'ctc & att' is checked, greedy means ctc greedy search; the same goes for beam.
- If only 'ctc & att' (AED) is checked, rescoring means ctc beam & attention rescoring.
- What if the rnnt model uses wenet-style search? Coming soon.
| rnnt | ctc | att | greedy | beam | rescoring | fusion |
|------|-----|-----|--------|------|-----------|--------|
| ✔ | ✔ | ✔ | 4.88 | 4.67 | 4.45 | 4.49 |
| ✔ | ✔ | | 5.56 | 5.46 | / | 5.40 |
| ✔ | | ✔ | 5.03 | 4.94 | 4.87 | / |
| ✔ | | | 5.64 | 5.59 | / | / |
| | ✔ | ✔ | 4.94 | 4.94 | 4.61 | / |
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: true
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
joint_conf:
join_dim: 512
prejoin_linear: True
postjoin_linear: false
joint_mode: 'add'
activation: 'tanh'
predictor: rnn
predictor_conf:
embed_size: 256
output_size: 256
embed_dropout: 0.1
hidden_size: 256
num_layers: 2
bias: true
rnn_type: 'lstm'
dropout: 0.1
decoder: bitransformer
decoder_conf:
attention_heads: 4
dropout_rate: 0.1
linear_units: 2048
num_blocks: 3
positional_dropout_rate: 0.1
r_num_blocks: 3
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid transducer+ctc+attention
model_conf:
transducer_weight: 0.75
ctc_weight: 0.1
attention_weight: 0.15
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
dataset_conf:
filter_conf:
max_length: 40960
min_length: 10
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 8
grad_clip: 4
accum_grad: 1
max_epoch: 140
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 8
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
joint_conf:
join_dim: 512
prejoin_linear: True
postjoin_linear: false
joint_mode: 'add'
activation: 'tanh'
predictor: rnn
predictor_conf:
embed_size: 256
output_size: 256
embed_dropout: 0.1
hidden_size: 256
num_layers: 2
bias: true
rnn_type: 'lstm'
dropout: 0.1
decoder: bitransformer
decoder_conf:
attention_heads: 4
dropout_rate: 0.1
linear_units: 2048
num_blocks: 3
positional_dropout_rate: 0.1
r_num_blocks: 3
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid transducer+ctc+attention
model_conf:
transducer_weight: 0.75
ctc_weight: 0.1
attention_weight: 0.15
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
dataset_conf:
filter_conf:
max_length: 40960
min_length: 10
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 4
grad_clip: 4
accum_grad: 1
max_epoch: 130
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: true
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
joint_conf:
join_dim: 320
prejoin_linear: true
postjoin_linear: false
joint_mode: 'add'
activation: 'tanh'
predictor: embedding
predictor_conf:
embed_size: 320
embed_dropout: 0.1
n_head: 4
history_size: 5
bias: false
decoder: bitransformer
decoder_conf:
attention_heads: 4
dropout_rate: 0.1
linear_units: 2048
num_blocks: 3
positional_dropout_rate: 0.1
r_num_blocks: 3
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid transducer+ctc+attention
model_conf:
transducer_weight: 0.4
ctc_weight: 0.2
attention_weight: 0.4
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
dataset_conf:
filter_conf:
max_length: 40960
min_length: 10
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 30
grad_clip: 4
accum_grad: 1
max_epoch: 500
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
../s0/local
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
# 2022 Binbin Zhang(binbizha@qq.com)
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
# The number of machines (nodes) for multi-machine training; 1 means single-machine training.
# NFS is required if num_nodes > 1.
num_nodes=1
# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`.
# You should set the node_rank=0 on the first machine, set the node_rank=1
# on the second machine, and so on.
node_rank=0
# The aishell dataset location, please change this to your own path.
# Make sure to use an absolute path. DO NOT use a relative path!
data=/export/data/asr-data/OpenSLR/33/
data_url=www.openslr.org/resources/33
nj=16
dict=data/dict/lang_char.txt
# data_type can be `raw` or `shard`. Typically, `raw` is used for small datasets,
# while `shard` is used for large datasets (over 1k hours) and is faster for
# reading data and training.
data_type=raw
num_utts_per_shard=1000
train_set=train
train_config=conf/conformer_u2pp_rnnt.yaml
cmvn=true
dir=exp/conformer_rnnt
checkpoint=
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=30
decode_modes="rnnt_beam_search"
. tools/parse_options.sh || exit 1;
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
local/download_and_untar.sh ${data} ${data_url} data_aishell
local/download_and_untar.sh ${data} ${data_url} resource_aishell
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Data preparation
local/aishell_data_prep.sh ${data}/data_aishell/wav \
${data}/data_aishell/transcript
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# remove the space between the text labels for the Mandarin dataset
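# For example (a hypothetical transcript line):
#   "BAC009S0002W0122 甚 至 出 现 交 易" becomes "BAC009S0002W0122 甚至出现交易"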
for x in train dev test; do
cp data/${x}/text data/${x}/text.org
paste -d " " <(cut -f 1 -d" " data/${x}/text.org) \
<(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \
> data/${x}/text
rm data/${x}/text.org
done
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp data/${train_set}/wav.scp \
--out_cmvn data/$train_set/global_cmvn
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 is for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
tools/text2token.py -s 1 -n 1 data/train/text | cut -f 2- -d" " \
| tr " " "\n" | sort | uniq | grep -a -v -e '^\s*$' | \
awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict
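# The resulting lang_char.txt looks like this (token ids below are hypothetical):
#   <blank> 0
#   <unk> 1
#   一 2
#   丁 3
#   ...
#   <sos/eos> 4232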
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Prepare data, prepare required format"
for x in dev test ${train_set}; do
if [ $data_type == "shard" ]; then
tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
--num_threads 16 data/$x/wav.scp data/$x/text \
$(realpath data/$x/shards) data/$x/data.list
else
tools/make_raw_list.py data/$x/wav.scp data/$x/text \
data/$x/data.list
fi
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
mkdir -p $dir
# You have to rm `INIT_FILE` manually when you resume or restart a
# multi-machine training.
INIT_FILE=$dir/ddp_init
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp data/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py rewrites $train_config as $dir/train.yaml with the model input
# and output dimensions filled in; $dir/train.yaml will be used for inference
# and export.
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process, used to determine whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
--train_data data/$train_set/data.list \
--cv_data data/dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 1 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test model, please specify the model you want to test by --checkpoint
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Please specify decoding_chunk_size for unified streaming and
# non-streaming model. The default value is -1, which is full chunk
# for non-streaming inference.
decoding_chunk_size=
# only used in rescore mode for weighting different scores
rescore_ctc_weight=0.5
rescore_transducer_weight=0.5
rescore_attn_weight=0.5
# only used in beam search, either pure beam search mode OR beam search inside rescoring
search_ctc_weight=0.3
search_transducer_weight=0.7
reverse_weight=0.0
for mode in ${decode_modes}; do
{
test_dir=$dir/test_${mode}
mkdir -p $test_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data data/test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $rescore_ctc_weight \
--transducer_weight $rescore_transducer_weight \
--attn_weight $rescore_attn_weight \
--search_ctc_weight $search_ctc_weight \
--search_transducer_weight $search_transducer_weight \
--reverse_weight $reverse_weight \
--result_file $test_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
python tools/compute-wer.py --char=1 --v=1 \
data/test/text $test_dir/text > $test_dir/wer
} &
done
wait
fi
../../../tools
../../../wenet
# Performance Record
## Conformer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.002, batch size 18, 4 gpu, acc_grad 4, 240 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 20
* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a
| decoding mode | CER |
|---------------------------|-------|
| attention decoder | 5.18 |
| ctc greedy search | 4.94 |
| ctc prefix beam search | 4.94 |
| attention rescoring | 4.61 |
| LM + attention rescoring | 4.36 |
## U2++ Conformer Result
* Feature info: using fbank feature, dither=1.0, cmvn, online speed perturb
* Training info: lr 0.001, batch size 16, 8 gpu, acc_grad 1, 360 epochs
* Decoding info: ctc_weight 0.3, reverse_weight 0.5, average_num 30, lm_scale 0.7, decoder_scale 0.1, r_decoder_scale 0.7
* Git hash: 5a1342312668e7a5abb83aed1e53256819cebf95
| decoding mode/chunk size | full | 16 |
|---------------------------|-------|-------|
| ctc greedy search | 5.19 | 5.81 |
| ctc prefix beam search | 5.17 | 5.81 |
| attention rescoring | 4.63 | 5.05 |
| LM + attention rescoring | 4.40 | 4.75 |
| HLG(k2 LM) | 4.81 | 5.27 |
| HLG(k2 LM) + attention rescoring | 4.32 | 4.70 |
## Unified Conformer Result
* Feature info: using fbank feature, dither=0, cmvn, online speed perturb
* Training info: lr 0.001, batch size 16, 8 gpu, acc_grad 1, 180 epochs, dither 0.0
* Decoding info: ctc_weight 0.5, average_num 20
* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a
| decoding mode/chunk size | full | 16 | 8 | 4 |
|---------------------------|-------|-------|-------|-------|
| attention decoder | 5.40 | 5.60 | 5.74 | 5.86 |
| ctc greedy search | 5.56 | 6.29 | 6.68 | 7.10 |
| ctc prefix beam search | 5.57 | 6.30 | 6.67 | 7.10 |
| attention rescoring | 5.05 | 5.45 | 5.69 | 5.91 |
| LM + attention rescoring | 4.73 | 5.08 | 5.22 | 5.38 |
## U2++ Transformer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb.
* Training info: lr 0.001, batch size 26, 8 gpu, acc_grad 1, 360 epochs, dither 0.1
* Decoding info: ctc_weight 0.2, reverse_weight 0.5, average_num 30
* Git hash: 65270043fc8c2476d1ab95e7c39f730017a670e0
| decoding mode/chunk size | full | 16 |
|---------------------------|-------|-------|
| ctc greedy search | 6.05 | 6.92 |
| ctc prefix beam search | 6.05 | 6.90 |
| attention rescoring | 5.11 | 5.63 |
| LM + attention rescoring | 4.82 | 5.24 |
## Transformer Result
* Feature info: using fbank feature, dither, with cmvn, online speed perturb.
* Training info: lr 0.002, batch size 26, 4 gpu, acc_grad 4, 240 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 20
* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a
| decoding mode | CER |
|---------------------------|-------|
| attention decoder | 5.69 |
| ctc greedy search | 5.92 |
| ctc prefix beam search | 5.91 |
| attention rescoring | 5.30 |
| LM + attention rescoring | 5.04 |
## Unified Transformer Result
* Feature info: using fbank feature, dither=0, with cmvn, online speed perturb.
* Training info: lr 0.002, batch size 16, 4 gpu, acc_grad 1, 240 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 20
* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a
| decoding mode/chunk size | full | 16 | 8 | 4 |
|---------------------------|-------|-------|-------|-------|
| attention decoder | 6.04 | 6.35 | 6.45 | 6.70 |
| ctc greedy search | 6.28 | 6.99 | 7.39 | 7.89 |
| ctc prefix beam search | 6.28 | 6.98 | 7.40 | 7.89 |
| attention rescoring | 5.52 | 6.05 | 6.28 | 6.62 |
| LM + attention rescoring | 5.11 | 5.59 | 5.86 | 6.17 |
## AMP Training Transformer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.002, batch size, 4 gpus, acc_grad 4, 240 epochs, dither 0.1, warm up steps 25000
* Decoding info: ctc_weight 0.5, average_num 20
* Git hash: 1bb4e5a269c535340fae5b0739482fa47733d2c1
| decoding mode | CER |
|------------------------|------|
| attention decoder | 5.73 |
| ctc greedy search | 5.92 |
| ctc prefix beam search | 5.92 |
| attention rescoring | 5.31 |
## Multi-machine Training Conformer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.004, batch size 16, 2 machines, 8\*2=16 gpus, acc_grad 4, 240 epochs, dither 0.1, warm up steps 10000
* Decoding info: ctc_weight 0.5, average_num 20
* Git hash: f6b1409023440da1998d31abbcc3826dd40aaf35
| decoding mode | CER |
|------------------------|------|
| attention decoder | 4.90 |
| ctc greedy search | 5.07 |
| ctc prefix beam search | 5.06 |
| attention rescoring | 4.65 |
## Conformer with/without Position Encoding Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.002, batch size 16, 8 gpu, acc_grad 4, 240 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 20
| decoding mode | with PE | without PE |
|------------------------|---------|------------|
| attention decoder | 5.18 | 5.73 |
| ctc greedy search | 4.94 | 4.97 |
| ctc prefix beam search | 4.94 | 4.97 |
| attention rescoring | 4.61 | 4.69 |
## Efficient Conformer v1 Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.001, batch size 16, 8 gpu, acc_grad 1, 200 epochs
* Decoding info: ctc_weight 0.5, reverse_weight 0.3, average_num 20
| decoding mode | full | 18 | 16 |
|------------------------|------|------|------|
| attention decoder | 4.99 | 5.13 | 5.16 |
| ctc prefix beam search | 4.98 | 5.23 | 5.23 |
| attention rescoring | 4.64 | 4.86 | 4.85 |
## Efficient Conformer v2 Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.001, batch size 16, 8 gpu, acc_grad 1, 200 epochs
* Decoding info: ctc_weight 0.5, reverse_weight 0.3, average_num 20
| decoding mode | full | 18 | 16 |
|------------------------|------|------|------|
| attention decoder | 4.87 | 5.03 | 5.07 |
| ctc prefix beam search | 4.97 | 5.18 | 5.20 |
| attention rescoring | 4.56 | 4.75 | 4.77 |
# Benchmark on Conformer
| IO | CER |
|--------------|-------|
| Old | 4.61 |
| UIO(Raw) | 4.63 |
| UIO(Shards) | 4.67 |
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 4
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'no_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 4
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder architecture type
normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 26
grad_clip: 5
accum_grad: 1
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 8
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
spec_sub: true
spec_sub_conf:
num_t_sub: 3
max_t: 30
spec_trim: false
spec_trim_conf:
max_t: 50
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 360
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000