Commit 764b3a75 authored by Sugon_ldc

add new model
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: true
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
joint_conf:
join_dim: 512
prejoin_linear: True
postjoin_linear: false
joint_mode: 'add'
activation: 'tanh'
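# Rough sketch of what this joint config implies (not the literal implementation):
#   h = tanh(Linear_enc(enc_out) + Linear_pred(pred_out))  # prejoin_linear projects both to join_dim, joined with 'add'
#   logits = Linear_out(h)                                  # final projection to the vocabulary; postjoin_linear is disabled here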
predictor: rnn
predictor_conf:
embed_size: 256
output_size: 256
embed_dropout: 0.1
hidden_size: 256
num_layers: 2
bias: true
rnn_type: 'lstm'
dropout: 0.1
decoder: bitransformer
decoder_conf:
attention_heads: 4
dropout_rate: 0.1
linear_units: 2048
num_blocks: 3
positional_dropout_rate: 0.1
r_num_blocks: 3
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid transducer+ctc+attention
model_conf:
transducer_weight: 0.75
ctc_weight: 0.1
attention_weight: 0.15
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
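# Loosely, the training loss combines the three branches as
#   loss ≈ transducer_weight * L_rnnt + ctc_weight * L_ctc + attention_weight * L_att
# with reverse_weight mixing the right-to-left decoder into the attention branch.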
dataset_conf:
filter_conf:
max_length: 1650
min_length: 10
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
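# SpecAugment sketch: num_t_mask random time masks (each up to max_t frames) and
# num_f_mask frequency masks (each up to max_f mel bins) are applied per utterance.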
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'dynamic' # static or dynamic
max_frames_in_batch: 4000
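# With 'dynamic' batching, utterances are packed into a batch until the total
# number of frames would exceed max_frames_in_batch (rough description).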
grad_clip: 4
accum_grad: 1
max_epoch: 140
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
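# warmuplr sketch, assuming the usual Noam-style implementation:
#   lr(step) = lr * warmup_steps^0.5 * min(step^-0.5, step * warmup_steps^-1.5)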
../s0/local/
../s0/path.sh
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU; otherwise it's multi-GPU training based on DDP in PyTorch
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
stage=-1 # start from -1 if you need to download the data, 0 to start from data preparation
stop_stage=7
# data
# alternative mirrors:
# data_url=www.openslr.org/resources/12
# data_url=https://us.openslr.org/resources/12
data_url=https://openslr.elda.org/resources/12
# use your own data path
datadir=
# wav data dir
wave_data=data
# Optional train_config
# 1. conf/train_transformer_large.yaml: Standard transformer
train_config=conf/conformer_rnnt.yaml
checkpoint=
cmvn=true
do_delta=false
dir=exp/conformer_transducer
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
# you may need to adjust it if you cannot get results close to those in README.md
average_num=10
decode_modes="attention_rescoring ctc_greedy_search ctc_prefix_beam_search attention"
. tools/parse_options.sh || exit 1;
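# Example invocations (paths and stage numbers are illustrative):
#   bash run.sh --stage 0 --stop_stage 3 --datadir /path/to/OpenSLR   # data preparation only
#   bash run.sh --stage 4 --stop_stage 4                              # training only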
# bpemode (unigram or bpe)
nbpe=5000
bpemode=unigram
set -e
set -u
set -o pipefail
train_set=train_960
dev_set=dev
recog_set="test_clean test_other dev_clean dev_other"
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
for part in train-clean-100 train-clean-360 train-other-500; do
local/download_and_untar.sh ${datadir} ${data_url} ${part}
done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to do the following data preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: Data preparation"
for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
# use underscore-separated names in data directories.
local/data_prep_torchaudio.sh ${datadir}/LibriSpeech/${part} $wave_data/${part//-/_}
done
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
### Task dependent. You have to design training and dev sets by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 1: Feature Generation"
mkdir -p $wave_data/train_960
# merge total training data
for set in train_clean_100 train_clean_360 train_other_500; do
for f in `ls $wave_data/$set`; do
cat $wave_data/$set/$f >> $wave_data/train_960/$f
done
done
mkdir -p $wave_data/dev
# merge total dev data
for set in dev_clean dev_other; do
for f in `ls $wave_data/$set`; do
cat $wave_data/$set/$f >> $wave_data/$dev_set/$f
done
done
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp $wave_data/$train_set/wav.scp \
--out_cmvn $wave_data/$train_set/global_cmvn
fi
dict=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p data/lang_char/
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
# we borrowed the BPE-related code and scripts from ESPnet.
cut -f 2- -d" " $wave_data/${train_set}/text > $wave_data/lang_char/input.txt
tools/spm_train --input=$wave_data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
tools/spm_encode --model=${bpemodel}.model --output_format=piece < $wave_data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
wc -l ${dict}
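# The resulting dictionary looks roughly like this (token ids are illustrative):
#   <blank> 0
#   <unk> 1
#   ▁THE 2
#   ...one line per BPE piece...
#   <sos/eos> <next free id>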
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# Prepare wenet required data
echo "Prepare data, prepare required format"
for x in $dev_set ${recog_set} $train_set ; do
tools/make_raw_list.py $wave_data/$x/wav.scp $wave_data/$x/text \
$wave_data/$x/data.list
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
dist_backend="nccl"
cmvn_opts=
$cmvn && cmvn_opts="--cmvn $wave_data/${train_set}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
python3 wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type raw \
--symbol_table $dict \
--bpe_model ${bpemodel}.model \
--train_data $wave_data/$train_set/data.list \
--cv_data $wave_data/$dev_set/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $num_gpus \
--ddp.rank $i \
--ddp.dist_backend $dist_backend \
--num_workers 4 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test model, please specify the model you want to test by --checkpoint
cmvn_opts=
$cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
# TODO, Add model average here
mkdir -p $dir/test
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
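# e.g. decoding_chunk_size=16 to simulate chunk-based streaming decoding,
# or leave it empty / set -1 for full-context decoding (illustrative values)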
ctc_weight=0.5
# Poll GPU ids in round-robin order, beginning with index 0
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
idx=0
for test in $recog_set; do
for mode in ${decode_modes}; do
{
{
test_dir=$dir/${test}_${mode}
mkdir -p $test_dir
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type raw \
--dict $dict \
--bpe_model ${bpemodel}.model \
--test_data $wave_data/$test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--result_file $test_dir/text_bpe \
--ctc_weight $ctc_weight \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp
cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp
tools/spm_decode --model=${bpemodel}.model --input_format=piece \
< $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value_tmp
paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value_tmp > $test_dir/text
python tools/compute-wer.py --char=1 --v=1 \
$wave_data/$test/text $test_dir/text > $test_dir/wer
} &
((idx+=1))
if [ $idx -eq $num_gpus ]; then
idx=0
fi
}
done
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip
fi
# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
lm=data/local/lm
lexicon=data/local/dict/lexicon.txt
mkdir -p $lm
mkdir -p data/local/dict
# 7.1 Download & format LM
which_lm=3-gram.pruned.1e-7.arpa.gz
if [ ! -e ${lm}/${which_lm} ]; then
wget http://www.openslr.org/resources/11/${which_lm} -P ${lm}
fi
echo "unzip lm($which_lm)..."
gunzip -k ${lm}/${which_lm} -c > ${lm}/lm.arpa
echo "Lm saved as ${lm}/lm.arpa"
# 7.2 Prepare dict
unit_file=$dict
bpemodel=$bpemodel
# use $dir/words.txt (unit_file) and $dir/train_960_unigram5000 (bpemodel)
# if you download pretrained librispeech conformer model
cp $unit_file data/local/dict/units.txt
if [ ! -e ${lm}/librispeech-lexicon.txt ]; then
wget http://www.openslr.org/resources/11/librispeech-lexicon.txt -P ${lm}
fi
echo "build lexicon..."
tools/fst/prepare_dict.py $unit_file ${lm}/librispeech-lexicon.txt \
$lexicon $bpemodel.model
echo "lexicon saved as '$lexicon'"
# 7.3 Build decoding TLG
tools/fst/compile_lexicon_token_fst.sh \
data/local/dict data/local/tmp data/local/lang
tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
# 7.4 Decoding with runtime
fst_dir=data/lang_test
for test in ${recog_set}; do
./tools/decode.sh --nj 6 \
--beam 10.0 --lattice_beam 5 --max_active 7000 --blank_skip_thresh 0.98 \
--ctc_weight 0.5 --rescoring_weight 1.0 --acoustic_scale 1.2 \
--fst_path $fst_dir/TLG.fst \
--dict_path $fst_dir/words.txt \
data/$test/wav.scp data/$test/text $dir/final.zip $fst_dir/units.txt \
$dir/lm_with_runtime_${test}
tail $dir/lm_with_runtime_${test}/wer
done
fi
../../../tools/
../../../wenet/
# Performance Record
## Conformer Result Bidecoder (large)
* Encoder FLOPs(30s): 96,238,430,720, params: 85,709,704
* Feature info: using fbank feature, cmvn, dither, online speed perturb
* Training info: train_conformer_bidecoder_large.yaml, kernel size 31, lr 0.002, batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 1.0
* Decoding info: ctc_weight 0.3, reverse weight 0.5, average_num 30
* Git hash: 65270043fc8c2476d1ab95e7c39f730017a670e0
* LM-tgmed: [3-gram.pruned.1e-7.arpa.gz](http://www.openslr.org/resources/11/3-gram.pruned.1e-7.arpa.gz)
* LM-tglarge: [3-gram.arpa.gz](http://www.openslr.org/resources/11/3-gram.arpa.gz)
* LM-fglarge: [4-gram.arpa.gz](http://www.openslr.org/resources/11/4-gram.arpa.gz)
| decoding mode | test clean | test other |
|----------------------------------|------------|------------|
| ctc prefix beam search | 2.96 | 7.14 |
| attention rescoring | 2.66 | 6.53 |
| LM-tgmed + attention rescoring | 2.78 | 6.32 |
| LM-tglarge + attention rescoring | 2.68 | 6.10 |
| LM-fglarge + attention rescoring | 2.65 | 5.98 |
## SqueezeFormer Result (U2++, FFN:2048)
* Encoder info:
* SM12, reduce_idx 5, recover_idx 11, conv1d, batch_norm, syncbn
* encoder_dim 512, output_size 512, head 8, ffn_dim 512*4=2048
* Encoder FLOPs(30s): 82,283,704,832, params: 85,984,648
* Feature info:
* using fbank feature, cmvn, dither, online speed perturb, spec_aug
* Training info:
* train_squeezeformer_bidecoder_large.yaml, kernel size 31
* batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 1.0
* adamw, lr 8e-4, NoamHold, warmup 0.2, hold 0.3, lr_decay 1.0
* Decoding info:
* ctc_weight 0.3, reverse weight 0.5, average_num 30
| decoding mode | dev clean | dev other | test clean | test other |
|----------------------------------|-----------|-----------|------------|------------|
| ctc greedy search | 2.55 | 6.62 | 2.73 | 6.59 |
| ctc prefix beam search | 2.53 | 6.60 | 2.72 | 6.52 |
| attention decoder | 2.93 | 6.56 | 3.31 | 6.47 |
| attention rescoring | 2.19 | 6.06 | 2.45 | 5.85 |
## Conformer Result
* Encoder FLOPs(30s): 34,085,088,512, params: 34,761,608
* Feature info: using fbank feature, cmvn, dither, online speed perturb
* Training info: train_conformer.yaml, kernel size 31, lr 0.004, batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 30
* Git hash: 90d9a559840e765e82119ab72a11a1f7c1a01b78
* LM-fglarge: [4-gram.arpa.gz](http://www.openslr.org/resources/11/4-gram.arpa.gz)
| decoding mode | test clean | test other |
|----------------------------------|------------|------------|
| ctc greedy search | 3.51 | 9.57 |
| ctc prefix beam search | 3.51 | 9.56 |
| attention decoder | 3.05 | 8.36 |
| attention rescoring | 3.18 | 8.72 |
| attention rescoring (beam 50) | 3.12 | 8.55 |
| LM-fglarge + attention rescoring | 3.09 | 7.40 |
## Conformer Result (12 layers, FFN:2048)
* Encoder FLOPs(30s): 34,085,088,512, params: 34,761,608
* Feature info: using fbank feature, cmvn, dither, online speed perturb
* Training info: train_squeezeformer.yaml, kernel size 31,
* batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 0.1
* AdamW, lr 1e-3, NoamHold, warmup 0.2, hold 0.3, lr_decay 1.0
* Decoding info: ctc_weight 0.3, reverse weight 0.5, average_num 30
| decoding mode | dev clean | dev other | test clean | test other |
|----------------------------------|-----------|-----------|------------|------------|
| ctc greedy search | 3.49 | 9.59 | 3.66 | 9.59 |
| ctc prefix beam search | 3.49 | 9.61 | 3.66 | 9.55 |
| attention decoder | 3.52 | 9.04 | 3.85 | 8.97 |
| attention rescoring | 3.10 | 8.91 | 3.29 | 8.81 |
## SqueezeFormer Result (SM12, FFN:1024)
* Encoder info:
* SM12, reduce_idx 5, recover_idx 11, conv2d, w/o syncbn
* encoder_dim 256, output_size 256, head 4, ffn_dim 256*4=1024
* Encoder FLOPs(30s): 21,158,877,440, params: 22,219,912
* Feature info:
* using fbank feature, cmvn, dither, online speed perturb
* Training info:
* train_squeezeformer.yaml, kernel size 31,
* batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 0.1
* adamw, lr=1e-3, noamhold, warmup=0.2, hold=0.3, lr_decay=1.0
* Decoding info: ctc_weight 0.3, reverse weight 0.5, average_num 30
| decoding mode | dev clean | dev other | test clean | test other |
|----------------------------------|-----------|-----------|------------|------------|
| ctc greedy search | 3.49 | 9.24 | 3.51 | 9.28 |
| ctc prefix beam search | 3.44 | 9.23 | 3.51 | 9.25 |
| attention decoder | 3.59 | 8.74 | 3.75 | 8.70 |
| attention rescoring | 2.97 | 8.48 | 3.07 | 8.44 |
## SqueezeFormer Result (SM12, FFN:2048)
* Encoder info:
* SM12, reduce_idx 5, recover_idx 11, conv2d, w/o syncbn
* encoder_dim 256, output_size 256, head 4, ffn_dim 256*8=2048
* encoder FLOPs(30s): 28,230,473,984, params: 34,827,400
* Feature info: using fbank feature, cmvn, dither, online speed perturb
* Training info:
* train_squeezeformer.yaml, kernel size 31
* batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 0.1
* adamw, lr 1e-3, noamhold, warmup 0.2, hold 0.3, lr_decay 1.0
* Decoding info:
* ctc_weight 0.3, reverse weight 0.5, average_num 30
| decoding mode | dev clean | dev other | test clean | test other |
|----------------------------------|-----------|-----------|------------|------------|
| ctc greedy search | 3.34 | 9.01 | 3.47 | 8.85 |
| ctc prefix beam search | 3.33 | 9.02 | 3.46 | 8.81 |
| attention decoder | 3.64 | 8.62 | 3.91 | 8.33 |
| attention rescoring | 2.89 | 8.34 | 3.10 | 8.03 |
## SqueezeFormer Result (SM12, FFN:1312)
* Encoder info:
* SM12, reduce_idx 5, recover_idx 11, conv1d, w/o syncbn
* encoder_dim 328, output_size 256, head 4, ffn_dim 328*4=1312
* encoder FLOPs(30s): 34,103,960,008, params: 35,678,352
* Feature info:
* using fbank feature, cmvn, dither, online speed perturb
* Training info:
* train_squeezeformer.yaml, kernel size 31,
* batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 1.0
* adamw, lr 1e-3, noamhold, warmup 0.2, hold 0.3, lr_decay 1.0
* Decoding info:
* ctc_weight 0.3, reverse weight 0.5, average_num 30
| decoding mode | dev clean | dev other | test clean | test other |
|----------------------------------|-----------|-----------|------------|------------|
| ctc greedy search | 3.20 | 8.46 | 3.30 | 8.58 |
| ctc prefix beam search | 3.18 | 8.44 | 3.30 | 8.55 |
| attention decoder | 3.38 | 8.31 | 3.89 | 8.32 |
| attention rescoring | 2.81 | 7.86 | 2.96 | 7.91 |
## Conformer U2++ Result
* Feature info: using fbank feature, cmvn, no speed perturb, dither
* Training info: train_u2++_conformer.yaml lr 0.001, batch size 24, 8 gpu, acc_grad 1, 120 epochs, dither 1.0
* Decoding info: ctc_weight 0.3, reverse weight 0.5, average_num 30
* Git hash: 65270043fc8c2476d1ab95e7c39f730017a670e0
test clean
| decoding mode | full | 16 |
|--------------------------------|------|------|
| ctc prefix beam search | 3.76 | 4.54 |
| attention rescoring | 3.32 | 3.80 |
test other
| decoding mode | full | 16 |
|--------------------------------|-------|-------|
| ctc prefix beam search | 9.50 | 11.52 |
| attention rescoring | 8.67 | 10.38 |
## SqueezeFormer Result (U2++, FFN:2048)
* Encoder info:
* SM12, reduce_idx 5, recover_idx 11, conv1d, layer_norm
* do_rel_shift false, warp_for_time, syncbn
* encoder_dim 256, output_size 256, head 4, ffn_dim 256*8=2048
* Encoder FLOPs(30s): 28,255,337,984, params: 34,893,704
* Feature info:
* using fbank feature, cmvn, dither, online speed perturb
* Training info:
* train_squeezeformer.yaml, kernel size 31
* batch size 12, 8 gpu, acc_grad 2, 120 epochs, dither 1.0
* adamw, lr 8e-4, NoamHold, warmup 0.2, hold 0.3, lr_decay 1.0
* Decoding info:
* ctc_weight 0.3, reverse weight 0.5, average_num 30
test clean
| decoding mode | full | 16 |
|--------------------------------|------|------|
| ctc prefix beam search | 3.45 | 4.34 |
| attention rescoring | 3.07 | 3.71 |
test other
| decoding mode | full | 16 |
|--------------------------------|-------|-------|
| ctc prefix beam search | 8.29 | 10.60 |
| attention rescoring | 7.58 | 9.60 |
## Conformer U2 Result
* Feature info: using fbank feature, cmvn, speed perturb, dither
* Training info: train_unified_conformer.yaml lr 0.001, batch size 10, 8 gpu, acc_grad 1, 120 epochs, dither 1.0
* Decoding info: ctc_weight 0.5, average_num 30
* Git hash: 90d9a559840e765e82119ab72a11a1f7c1a01b78
* LM-tgmed: [3-gram.pruned.1e-7.arpa.gz](http://www.openslr.org/resources/11/3-gram.pruned.1e-7.arpa.gz)
* LM-tglarge: [3-gram.arpa.gz](http://www.openslr.org/resources/11/3-gram.arpa.gz)
* LM-fglarge: [4-gram.arpa.gz](http://www.openslr.org/resources/11/4-gram.arpa.gz)
test clean
| decoding mode | full | 16 |
|----------------------------------|------|------|
| ctc prefix beam search | 4.26 | 5.00 |
| attention decoder | 3.05 | 3.44 |
| attention rescoring | 3.72 | 4.10 |
| attention rescoring (beam 50) | 3.57 | 3.95 |
| LM-tgmed + attention rescoring | 3.56 | 4.02 |
| LM-tglarge + attention rescoring | 3.40 | 3.82 |
| LM-fglarge + attention rescoring | 3.38 | 3.74 |
test other
| decoding mode | full | 16 |
|----------------------------------|-------|-------|
| ctc prefix beam search | 10.87 | 12.87 |
| attention decoder | 9.07 | 10.44 |
| attention rescoring | 9.74 | 11.61 |
| attention rescoring (beam 50) | 9.34 | 11.13 |
| LM-tgmed + attention rescoring | 8.78 | 10.26 |
| LM-tglarge + attention rescoring | 8.34 | 9.74 |
| LM-fglarge + attention rescoring | 8.17 | 9.44 |
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# dataset related
dataset_conf:
filter_conf:
max_length: 2000
min_length: 50
token_max_length: 400
token_min_length: 1
min_output_input_ratio: 0.0005
max_output_input_ratio: 0.1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 12
grad_clip: 5
accum_grad: 1
max_epoch: 70
log_interval: 100
optim: adam
optim_conf:
lr: 0.004
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 31
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
cnn_module_norm: 'layer_norm'
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
# dataset related
dataset_conf:
filter_conf:
max_length: 2000
min_length: 50
token_max_length: 400
token_min_length: 1
min_output_input_ratio: 0.0005
max_output_input_ratio: 0.1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 3
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 12
grad_clip: 5
accum_grad: 4
max_epoch: 120
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 50000
# network architecture
# encoder related
encoder: squeezeformer
encoder_conf:
encoder_dim: 256
output_size: 256 # dimension of attention
attention_heads: 4
num_blocks: 12 # the number of encoder blocks
reduce_idx: 5
recover_idx: 11
pos_enc_layer_type: 'rel_pos'
time_reduction_layer_type: 'conv1d'
feed_forward_expansion_factor: 4
input_dropout_rate: 0.1
feed_forward_dropout_rate: 0.1
attention_dropout_rate: 0.1
cnn_module_kernel: 31
cnn_norm_type: layer_norm
adaptive_scale: true
normalize_before: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# dataset related
dataset_conf:
filter_conf:
max_length: 2000
min_length: 50
token_max_length: 400
token_min_length: 1
min_output_input_ratio: 0.0005
max_output_input_ratio: 0.1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 12
grad_clip: 5
accum_grad: 4
max_epoch: 120
log_interval: 100
optim: adamw
optim_conf:
lr: 1.e-3
weight_decay: 4.e-5
scheduler: NoamHoldAnnealing
scheduler_conf:
warmup_ratio: 0.2
hold_ratio: 0.3
max_steps: 87960
decay_rate: 1.0
min_lr: 1.e-5
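# NoamHoldAnnealing sketch (assuming the usual schedule): warm up for
# warmup_ratio * max_steps steps, hold the peak lr for hold_ratio * max_steps steps,
# then anneal towards min_lr, with the decay shape controlled by decay_rate.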
# network architecture
# encoder related
encoder: squeezeformer
encoder_conf:
encoder_dim: 512
output_size: 512 # dimension of attention
attention_heads: 8
num_blocks: 12 # the number of encoder blocks
reduce_idx: 5
recover_idx: 11
feed_forward_expansion_factor: 4
input_dropout_rate: 0.1
feed_forward_dropout_rate: 0.1
attention_dropout_rate: 0.1
cnn_module_kernel: 31
cnn_norm_type: batch_norm
adaptive_scale: true
normalize_before: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
# dataset related
dataset_conf:
syncbn: true
filter_conf:
max_length: 2000
min_length: 50
token_max_length: 400
token_min_length: 1
min_output_input_ratio: 0.0005
max_output_input_ratio: 0.1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 3
num_f_mask: 2
max_t: 100
max_f: 27
max_w: 80
# warp_for_time: true
spec_sub: true
spec_sub_conf:
num_t_sub: 3
max_t: 30
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 12
grad_clip: 5
accum_grad: 4
max_epoch: 120
log_interval: 100
optim: adamw
optim_conf:
lr: 1.e-3
weight_decay: 4.e-5
scheduler: NoamHoldAnnealing
scheduler_conf:
warmup_ratio: 0.2
hold_ratio: 0.3
max_steps: 87960
decay_rate: 1.0
min_lr: 1.e-5
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
# dataset related
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 400
token_min_length: 1
# min_output_input_ratio: 0.0005
# max_output_input_ratio: 0.1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
warp_for_time: true
spec_sub: true
spec_sub_conf:
num_t_sub: 3
max_t: 30
shuffle: true
shuffle_conf:
shuffle_size: 10000
sort: true
sort_conf:
sort_size: 2000 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 24
grad_clip: 5
accum_grad: 1
max_epoch: 120
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: squeezeformer
encoder_conf:
encoder_dim: 256
output_size: 256 # dimension of attention
attention_heads: 4
num_blocks: 12 # the number of encoder blocks
reduce_idx: 5
recover_idx: 11
time_reduction_layer_type: "stream"
feed_forward_expansion_factor: 8
input_dropout_rate: 0.1
feed_forward_dropout_rate: 0.1
attention_dropout_rate: 0.1
cnn_module_kernel: 31
do_rel_shift: false
cnn_norm_type: layer_norm
adaptive_scale: true
normalize_before: false
causal: true
use_dynamic_chunk: true
use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
# dataset related
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 400
token_min_length: 1
# min_output_input_ratio: 0.0005
# max_output_input_ratio: 0.1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
# warp_for_time: true
spec_sub: true
spec_sub_conf:
num_t_sub: 3
max_t: 30
shuffle: true
shuffle_conf:
shuffle_size: 10000
sort: true
sort_conf:
sort_size: 2000 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 12
grad_clip: 5
accum_grad: 2
max_epoch: 120
log_interval: 100
optim: adamw
optim_conf:
lr: 8.e-4
weight_decay: 4.e-5
scheduler: NoamHoldAnnealing
scheduler_conf:
warmup_ratio: 0.2
hold_ratio: 0.3
max_steps: 175680
decay_rate: 1.0
min_lr: 1.e-5
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# dataset related
dataset_conf:
filter_conf:
max_length: 2000
min_length: 50
token_max_length: 400
token_min_length: 1
min_output_input_ratio: 0.0005
max_output_input_ratio: 0.1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 120
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
#!/bin/bash
# Copyright 2014 Vassil Panayotov
# 2014 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <src-dir> <dst-dir>"
echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
exit 1
fi
src=$1
dst=$2
# all utterances are FLAC compressed
if ! which flac >&/dev/null; then
echo "Please install 'flac' on ALL worker nodes!"
exit 1
fi
mkdir -p $dst || exit 1
[ ! -d $src ] && echo "$0: no such directory $src" && exit 1
wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp
trans=$dst/text; [[ -f "$trans" ]] && rm $trans
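# The loops below produce two Kaldi-style files (format shown for illustration):
#   wav.scp : <utterance-id> <path/to/utterance>.flac
#   text    : <utterance-id> <TRANSCRIPT>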
for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do
reader=$(basename $reader_dir)
if ! [ $reader -eq $reader ]; then # not integer.
echo "$0: unexpected subdirectory name $reader"
exit 1
fi
for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do
chapter=$(basename $chapter_dir)
if ! [ "$chapter" -eq "$chapter" ]; then
echo "$0: unexpected chapter-subdirectory name $chapter"
exit 1
fi
find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \
awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac\n", $0, dir, $0}' >>$wav_scp || exit 1
chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
[ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1
cat $chapter_trans >>$trans
done
done
echo "$0: successfully prepared data in $dst"
exit 0
#!/bin/bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
remove_archive=false
if [ "$1" == --remove-archive ]; then
remove_archive=true
shift
fi
if [ $# -ne 3 ]; then
echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean"
echo "With --remove-archive it will remove the archive after successfully un-tarring it."
echo "<corpus-part> can be one of: dev-clean, test-clean, dev-other, test-other,"
echo " train-clean-100, train-clean-360, train-other-500."
exit 1
fi
data=$1
url=$2
part=$3
if [ ! -d "$data" ]; then
echo "$0: no such directory $data"
exit 1
fi
part_ok=false
list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500"
for x in $list; do
if [ "$part" == $x ]; then part_ok=true; fi
done
if ! $part_ok; then
echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
exit 1
fi
if [ -z "$url" ]; then
echo "$0: empty URL base."
exit 1
fi
if [ -f $data/LibriSpeech/$part/.complete ]; then
echo "$0: data part $part was already successfully extracted, nothing to do."
exit 0
fi
# sizes of the archive files in bytes. These are from some older versions.
sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128"
# sizes_new contains the archive file sizes of the final release. Some of these sizes are for
# things we probably won't download.
sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606"
if [ -f $data/$part.tar.gz ]; then
size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
size_ok=false
for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done
if ! $size_ok; then
echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
echo "does not equal the size of one of the archives."
rm $data/$part.tar.gz
else
echo "$data/$part.tar.gz exists and appears to be complete."
fi
fi
if [ ! -f $data/$part.tar.gz ]; then
if ! which wget >/dev/null; then
echo "$0: wget is not installed."
exit 1
fi
full_url=$url/$part.tar.gz
echo "$0: downloading data from $full_url. This may take some time, please be patient."
if ! wget -P $data --no-check-certificate $full_url; then
echo "$0: error executing wget $full_url"
exit 1
fi
fi
if ! tar -C $data -xvzf $data/$part.tar.gz; then
echo "$0: error un-tarring archive $data/$part.tar.gz"
exit 1
fi
touch $data/LibriSpeech/$part/.complete
echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"
if $remove_archive; then
echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
rm $data/$part.tar.gz
fi
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_BIN=${BUILD_DIR}/../fc_base/openfst-build/src
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_BIN}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU; otherwise it's multi-GPU training based on DDP in PyTorch
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
# data
data_url=www.openslr.org/resources/12
# use your own data path
datadir=/export/data/en-asr-data/OpenSLR
# wav data dir
wave_data=data
# Optional train_config
# 1. conf/train_transformer_large.yaml: Standard transformer
train_config=conf/train_conformer.yaml
checkpoint=
cmvn=true
do_delta=false
dir=exp/sp_spec_aug
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
# you may need to adjust it if you cannot get results close to those in README.md
average_num=10
decode_modes="attention_rescoring ctc_greedy_search ctc_prefix_beam_search attention"
. tools/parse_options.sh || exit 1;
# bpemode (unigram or bpe)
nbpe=5000
bpemode=unigram
set -e
set -u
set -o pipefail
train_set=train_960
dev_set=dev
recog_set="test_clean test_other dev_clean dev_other"
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
local/download_and_untar.sh ${datadir} ${data_url} ${part}
done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to do the following data preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: Data preparation"
for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
# use underscore-separated names in data directories.
local/data_prep_torchaudio.sh ${datadir}/LibriSpeech/${part} $wave_data/${part//-/_}
done
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
### Task dependent. You have to design training and dev sets by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 1: Feature Generation"
mkdir -p $wave_data/train_960
# merge total training data
for set in train_clean_100 train_clean_360 train_other_500; do
for f in `ls $wave_data/$set`; do
cat $wave_data/$set/$f >> $wave_data/train_960/$f
done
done
mkdir -p $wave_data/dev
# merge total dev data
for set in dev_clean dev_other; do
for f in `ls $wave_data/$set`; do
cat $wave_data/$set/$f >> $wave_data/$dev_set/$f
done
done
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp $wave_data/$train_set/wav.scp \
--out_cmvn $wave_data/$train_set/global_cmvn
fi
dict=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p data/lang_char/
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
# we borrowed the BPE-related code and scripts from ESPnet.
cut -f 2- -d" " $wave_data/${train_set}/text > $wave_data/lang_char/input.txt
tools/spm_train --input=$wave_data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
tools/spm_encode --model=${bpemodel}.model --output_format=piece < $wave_data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
wc -l ${dict}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# Prepare wenet required data
echo "Prepare data, prepare required format"
for x in $dev_set ${recog_set} $train_set ; do
tools/make_raw_list.py $wave_data/$x/wav.scp $wave_data/$x/text \
$wave_data/$x/data.list
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
cmvn_opts=
$cmvn && cmvn_opts="--cmvn $wave_data/${train_set}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type raw \
--symbol_table $dict \
--bpe_model ${bpemodel}.model \
--train_data $wave_data/$train_set/data.list \
--cv_data $wave_data/$dev_set/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $num_gpus \
--ddp.rank $i \
--ddp.dist_backend $dist_backend \
--num_workers 1 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test model, please specify the model you want to test by --checkpoint
cmvn_opts=
$cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
# TODO, Add model average here
mkdir -p $dir/test
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
ctc_weight=0.5
# Poll GPU ids in round-robin order, beginning with index 0
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
idx=0
for test in $recog_set; do
for mode in ${decode_modes}; do
{
{
test_dir=$dir/${test}_${mode}
mkdir -p $test_dir
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type raw \
--dict $dict \
--bpe_model ${bpemodel}.model \
--test_data $wave_data/$test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--result_file $test_dir/text_bpe \
--ctc_weight $ctc_weight \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp
cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp
tools/spm_decode --model=${bpemodel}.model --input_format=piece \
< $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value_tmp
paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value_tmp > $test_dir/text
python tools/compute-wer.py --char=1 --v=1 \
$wave_data/$test/text $test_dir/text > $test_dir/wer
} &
((idx+=1))
if [ $idx -eq $num_gpus ]; then
idx=0
fi
}
done
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip
fi
# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
lm=data/local/lm
lexicon=data/local/dict/lexicon.txt
mkdir -p $lm
mkdir -p data/local/dict
# 7.1 Download & format LM
which_lm=3-gram.pruned.1e-7.arpa.gz
if [ ! -e ${lm}/${which_lm} ]; then
wget http://www.openslr.org/resources/11/${which_lm} -P ${lm}
fi
echo "unzip lm($which_lm)..."
gunzip -k ${lm}/${which_lm} -c > ${lm}/lm.arpa
echo "Lm saved as ${lm}/lm.arpa"
# 7.2 Prepare dict
unit_file=$dict
bpemodel=$bpemodel
# use $dir/words.txt (unit_file) and $dir/train_960_unigram5000 (bpemodel)
# if you download pretrained librispeech conformer model
cp $unit_file data/local/dict/units.txt
if [ ! -e ${lm}/librispeech-lexicon.txt ]; then
wget http://www.openslr.org/resources/11/librispeech-lexicon.txt -P ${lm}
fi
echo "build lexicon..."
tools/fst/prepare_dict.py $unit_file ${lm}/librispeech-lexicon.txt \
$lexicon $bpemodel.model
echo "lexicon saved as '$lexicon'"
# 7.3 Build decoding TLG
tools/fst/compile_lexicon_token_fst.sh \
data/local/dict data/local/tmp data/local/lang
tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
# 7.4 Decoding with runtime
fst_dir=data/lang_test
for test in ${recog_set}; do
./tools/decode.sh --nj 6 \
--beam 10.0 --lattice_beam 5 --max_active 7000 --blank_skip_thresh 0.98 \
--ctc_weight 0.5 --rescoring_weight 1.0 --acoustic_scale 1.2 \
--fst_path $fst_dir/TLG.fst \
--dict_path $fst_dir/words.txt \
data/$test/wav.scp data/$test/text $dir/final.zip $fst_dir/units.txt \
$dir/lm_with_runtime_${test}
tail $dir/lm_with_runtime_${test}/wer
done
fi
../../../tools
../../../wenet