"src/targets/vscode:/vscode.git/clone" did not exist on "7604ecf5b9cd04f9df4219d6f1ef998951ed7449"
Commit 764b3a75 authored by Sugon_ldc's avatar Sugon_ldc
Browse files

add new model

parents
#!/bin/bash
# Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0.
# 2016 LeSpeech (Author: Xingyu Na)
# This script prepares the data directory for the thchs30 recipe.
# It reads the corpus and generates wav.scp and transcriptions.
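# Usage: local/thchs-30_data_prep.sh <corpus-dir> <data-dir>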
corpus_dir=$1
data=$2
echo "**** Creating THCHS-30 data folder ****"
mkdir -p $data/{train,dev,test}
#create wav.scp, utt2spk.scp, spk2utt.scp, text
(
for x in train dev test; do
echo "cleaning $data/$x"
part=$data/$x
rm -rf $part/{wav.scp,utt2spk,spk2utt,text}
echo "preparing scps and text in $part"
# Updated "for loop" to fix a compatibility issue on Mac. Contributed by Xi Chen, 03/06/2018.
for nn in `find $corpus_dir/$x -name "*.wav" | sort -u | xargs -I {} basename {} .wav`; do
spkid=`echo $nn | awk -F"_" '{print "" $1}'`
spk_char=`echo $spkid | sed 's/\([A-Z]\).*/\1/'`
spk_num=`echo $spkid | sed 's/[A-Z]\([0-9]\)/\1/'`
spkid=$(printf '%s%.2d' "$spk_char" "$spk_num")
utt_num=`echo $nn | awk -F"_" '{print $2}'`
uttid=$(printf '%s%.2d_%.3d' "$spk_char" "$spk_num" "$utt_num")
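# e.g. nn=A2_31 -> spk_char=A, spk_num=2, spkid=A02, uttid=A02_031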
echo $uttid $corpus_dir/$x/$nn.wav >> $part/wav.scp
echo $uttid $spkid >> $part/utt2spk
echo $uttid `sed -n 1p $corpus_dir/data/$nn.wav.trn` | sed 's/ l =//' >> $part/text
done
sort $part/wav.scp -o $part/wav.scp
sort $part/utt2spk -o $part/utt2spk
sort $part/text -o $part/text
tools/utt2spk_to_spk2utt.pl $part/utt2spk > $part/spk2utt
done
) || exit 1
tools/validate_data_dir.sh --no-feats $data/train || exit 1;
tools/validate_data_dir.sh --no-feats $data/dev || exit 1;
tools/validate_data_dir.sh --no-feats $data/test || exit 1;
#!/bin/bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# Copyright 2016 Tsinghua University (author: Dong Wang)
# Apache 2.0
# Adapted from librispeech recipe local/download_and_untar.sh
remove_archive=false
if [ "$1" == --remove-archive ]; then
remove_archive=true
shift
fi
if [ $# -ne 3 ]; then
echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
echo "e.g.: $0 /nfs/public/materials/data/thchs30-openslr www.openslr.org/resources/18 data_thchs30"
echo "With --remove-archive it will remove the archive after successfully un-tarring it."
echo "<corpus-part> can be one of: data_thchs30, test-noise, resource"
exit 1;
fi
data=$1
url=$2
part=$3
if [ ! -d "$data" ]; then
echo "$0: no such directory $data, make it"
mkdir -p $data
fi
part_ok=false
list="data_thchs30 test-noise resource"
for x in $list; do
if [ "$part" == $x ]; then part_ok=true; fi
done
if ! $part_ok; then
echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
exit 1;
fi
if [ -z "$url" ]; then
echo "$0: empty URL base."
exit 1;
fi
if [ -f $data/$part/.complete ]; then
echo "$0: data part $part was already successfully extracted, nothing to do."
exit 0;
fi
sizes="6453425169 1971460210 24813708"
if [ -f $data/$part.tgz ]; then
size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
size_ok=false
for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
if ! $size_ok; then
echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
echo "does not equal the size of one of the archives."
rm $data/$part.tgz
else
echo "$data/$part.tgz exists and appears to be complete."
fi
fi
if [ ! -f $data/$part.tgz ]; then
if ! which wget >/dev/null; then
echo "$0: wget is not installed."
exit 1;
fi
full_url=$url/$part.tgz
echo "$0: downloading data from $full_url. This may take some time, please be patient."
cd $data
pwd
echo " wget --no-check-certificate $full_url"
if ! wget --no-check-certificate $full_url; then
echo "$0: error executing wget $full_url"
exit 1;
fi
fi
cd $data
if ! tar -xvzf $part.tgz; then
echo "$0: error un-tarring archive $data/$part.tgz"
exit 1;
fi
touch $data/$part/.complete
echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
if $remove_archive; then
echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
rm $data/$part.tgz
fi
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan)
# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu)
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3"
stage=0 # start from 0 if you need to start from data preparation
stop_stage=6
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
# The num of nodes or machines used for multi-machine training
# Default 1 for single machine/node
# NFS will be needed if you want to run multi-machine training
num_nodes=1
# The rank of each node or machine, range from 0 to num_nodes -1
# The first node/machine sets node_rank 0, the second one sets node_rank 1
# the third one sets node_rank 2, and so on. Default 0
node_rank=0
# data
dbase=/ssd/nfs06/di.wu/open_source
aidatatang_url=www.openslr.org/resources/62
aishell_url=www.openslr.org/resources/33
magicdata_url=www.openslr.org/resources/68
primewords_url=www.openslr.org/resources/47
stcmds_url=www.openslr.org/resources/38
thchs_url=www.openslr.org/resources/18
nj=16
train_set=train
dev_set=dev
has_aishell2=false # The AISHELL2 train set is not publicly downloadable
# With this option true, the script assumes you have it in
# $dbase
has_tal=false # TAL data needs to be downloaded from Baidu SkyDrive
# With this option true, the script assumes you have
# TAL/TAL_ASR and TAL/TAL_ASR_mix in $dbase
data_type=raw # raw or shard
num_utts_per_shard=1000
shards_dir= # specify if you prefer to store the shards somewhere else
# Optional train_config
# 1. conf/train_transformer.yaml: Standard transformer
# 2. conf/train_conformer.yaml: Standard conformer
# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer
# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer
train_config=conf/train_conformer.yaml
# English modeling unit
# Optional 1. bpe 2. char
en_modeling_unit=bpe
dict=data/dict_$en_modeling_unit/lang_char.txt
cmvn=true
dir=exp/conformer
checkpoint=
# Using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=30
decode_modes="ctc_greedy_search ctc_prefix_beam_search"
decode_modes="$decode_modes attention attention_rescoring"
. tools/parse_options.sh || exit 1;
test_sets="aishell aidatatang magicdata thchs"
if $has_aishell2; then
test_sets="$test_sets aishell2"
fi
if $has_tal; then
test_sets="$test_sets tal_asr"
fi
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# download all training data
local/aidatatang_download_and_untar.sh $dbase/aidatatang $aidatatang_url \
aidatatang_200zh || exit 1;
local/aishell_download_and_untar.sh $dbase/aishell $aishell_url \
data_aishell || exit 1;
local/magicdata_download_and_untar.sh $dbase/magicdata $magicdata_url \
train_set || exit 1;
local/primewords_download_and_untar.sh $dbase/primewords $primewords_url \
|| exit 1;
local/stcmds_download_and_untar.sh $dbase/stcmds $stcmds_url || exit 1;
local/thchs_download_and_untar.sh $dbase/thchs $thchs_url data_thchs30 || \
exit 1;
# download all test data
local/thchs_download_and_untar.sh $dbase/thchs $thchs_url test-noise \
|| exit 1;
local/magicdata_download_and_untar.sh $dbase/magicdata $magicdata_url \
dev_set || exit 1;
local/magicdata_download_and_untar.sh $dbase/magicdata $magicdata_url \
test_set || exit 1;
# TAL data needs to be downloaded from Baidu SkyDrive.
# The AISHELL-2 database is free for academic research, but commercial use
# requires permission. You need to request the data from the AISHELL company.
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Data preparation
local/aidatatang_data_prep.sh $dbase/aidatatang/aidatatang_200zh \
data/aidatatang || exit 1;
local/aishell_data_prep.sh $dbase/aishell/data_aishell data/aishell \
|| exit 1;
local/thchs-30_data_prep.sh $dbase/thchs/data_thchs30 data/thchs || exit 1;
local/magicdata_data_prep.sh $dbase/magicdata/ data/magicdata || exit 1;
local/primewords_data_prep.sh $dbase/primewords data/primewords || exit 1;
local/stcmds_data_prep.sh $dbase/stcmds data/stcmds || exit 1;
if $has_tal; then
local/tal_data_prep.sh $dbase/TAL/TAL_ASR data/tal_asr || exit 1;
local/tal_mix_data_prep.sh $dbase/TAL/TAL_ASR_mix data/tal_mix || exit 1;
fi
if $has_aishell2; then
local/aishell2_data_prep.sh $dbase/aishell2/IOS data/aishell2/train \
|| exit 1;
local/aishell2_data_prep.sh $dbase/aishell2/IOS/dev data/aishell2/dev \
|| exit 1;
local/aishell2_data_prep.sh $dbase/aishell2/IOS/test data/aishell2/test \
|| exit 1;
fi
# Merge all data sets.
train_sets=aidatatang,aishell,magicdata,primewords,stcmds,thchs
dev_sets=aidatatang,aishell,magicdata,thchs
if $has_aishell2; then
train_sets=$train_sets,aishell2
dev_sets=$dev_sets,aishell2
fi
if $has_tal; then
train_sets=$train_sets,tal_asr,tal_mix
dev_sets=$dev_sets,tal_asr
fi
unrolled_train_sets=$(eval echo data/{$train_sets}/train)
unrolled_dev_sets=$(eval echo data/{$dev_sets}/dev)
tools/combine_data.sh data/train $unrolled_train_sets || exit 1;
tools/combine_data.sh data/dev $unrolled_dev_sets || exit 1;
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# For wav feature, just copy the data. Fbank extraction is done in training
mkdir -p data_${en_modeling_unit}
for x in ${train_set} ${dev_set}; do
cp -r data/$x data_${en_modeling_unit}
done
for x in ${test_sets}; do
cp -r data/$x/test data_${en_modeling_unit}/test_${x}
done
# Unify the data format for char and bpe modeling.
# Here we use ▁ as the delimiter between English words.
# Warning: it is the "▁" symbol, not the "_" symbol.
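# e.g. "i love 音乐" -> "I▁LOVE音乐" (uppercase, join adjacent English words with ▁, drop remaining spaces)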
for x in train dev; do
cp data_${en_modeling_unit}/${x}/text data_${en_modeling_unit}/${x}/text.org
paste -d " " <(cut -f 1 -d" " data_${en_modeling_unit}/${x}/text.org) \
<(cut -f 2- -d" " data_${en_modeling_unit}/${x}/text.org \
| tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' \
| sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") \
> data_${en_modeling_unit}/${x}/text
sed -i 's/\xEF\xBB\xBF//' data_${en_modeling_unit}/${x}/text
done
for x in ${test_sets}; do
cp data_${en_modeling_unit}/test_${x}/text \
data_${en_modeling_unit}/test_${x}/text.org
paste -d " " <(cut -f 1 -d" " data_${en_modeling_unit}/test_${x}/text.org) \
<(cut -f 2- -d" " data_${en_modeling_unit}/test_${x}/text.org \
| tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' \
| sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") \
> data_${en_modeling_unit}/test_${x}/text
sed -i 's/\xEF\xBB\xBF//' data_${en_modeling_unit}/test_${x}/text
done
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Compute cmvn"
# Here we use all the training data, you can sample some data to save time
if $cmvn; then
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp data/${train_set}/wav.scp \
--out_cmvn data_${en_modeling_unit}/$train_set/global_cmvn
fi
fi
# This BPE model was trained on the LibriSpeech training set.
bpecode=conf/train_960_unigram5000.model
trans_type_ops=
enable_bpe=
if [ $en_modeling_unit = "bpe" ]; then
trans_type_ops="--trans_type cn_char_en_bpe"
enable_bpe=true
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# Make train dict
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
tools/text2token.py -s 1 -n 1 -m ${bpecode} \
data_${en_modeling_unit}/${train_set}/text ${trans_type_ops} \
| cut -f 2- -d" " | tr " " "\n" | sort | uniq | grep -a -v -e '^\s*$' \
| grep -v '·' | grep -v '“' | grep -v "”" | grep -v "\[" | grep -v "\]" \
| grep -v "…" | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "Prepare data, prepare required format"
feat_test_sets=""
for x in ${test_sets}; do
feat_test_sets=${feat_test_sets}" "test_${x}
done
for x in ${dev_set} ${train_set} ${feat_test_sets}; do
if [ $data_type == "shard" ]; then
sdir=${shards_dir:+$shards_dir/}shards_${en_modeling_unit}
mkdir -p $sdir
tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
--num_threads 16 data_${en_modeling_unit}/$x/wav.scp \
data_${en_modeling_unit}/$x/text $(realpath $sdir/$x) \
data_${en_modeling_unit}/$x/data.list
else
tools/make_raw_list.py data_${en_modeling_unit}/$x/wav.scp \
data_${en_modeling_unit}/$x/text data_${en_modeling_unit}/$x/data.list
fi
done
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
# Better remove it manually before you start run.sh on the first node.
# rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="nccl"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp data_${en_modeling_unit}/$train_set/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process, used for knowing whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
--train_data data_${en_modeling_unit}/$train_set/data.list \
--cv_data data_${en_modeling_unit}/$dev_set/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 4 \
${enable_bpe:+--bpe_model $bpecode} \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Test model, please specify the model you want to test by --checkpoint
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=16
ctc_weight=0.5
idx=0
for mode in ${decode_modes}; do
{
for x in ${test_sets}; do
{
test_name=test_${mode}${decoding_chunk_size:+_chunk$decoding_chunk_size}
test_dir=$dir/$test_name/${x}
mkdir -p $test_dir
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data data_${en_modeling_unit}/test_${x}/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
${enable_bpe:+--bpe_model $bpecode} \
--result_file $test_dir/text_${en_modeling_unit} \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
cat $test_dir/text_${en_modeling_unit} | sed -e "s/▁/ /g" \
> $test_dir/text
cat data_${en_modeling_unit}/test_${x}/text | sed -e "s/▁/ /g" \
> data_${en_modeling_unit}/test_${x}/text.tmp
python tools/compute-wer.py --char=1 --v=1 \
data_${en_modeling_unit}/test_${x}/text.tmp $test_dir/text \
> $test_dir/wer
rm data_${en_modeling_unit}/test_${x}/text.tmp
}
done
} &
((idx+=1))
done
wait
fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip \
--output_quant_file $dir/final_quant.zip
fi
../../../tools
../../../wenet
# w2v-conformer based end-to-end model for the OpenASR2021 challenge
This is an example of using an unsupervised pretrained w2v-conformer model to fine-tune on the [OpenASR2021](https://www.nist.gov/itl/iad/mig/openasr-challenge) constrained-plus tasks.
We pretrain conformer encoders with the wav2vec 2.0 pre-training method, which we call ch-w2v-conformer. The original pre-training works take raw waveforms as input; unlike them, we use MFCC features as inputs.
The ch-w2v-conformer model is pretrained on the following datasets:
- ISML datasets (6 languages, 70k hours): an internal dataset containing 40k hours of Chinese, Cantonese, Tibetan, Inner Mongolian, Inner Kazakh, and Uighur.
- Babel datasets (17 languages, 2k hours): Assamese, Bengali, Cantonese, Cebuano, Georgian, Haitian, Kazakh, Kurmanji, Lao, Pashto, Swahili, Tagalog, Tamil, Tok, Turkish, Vietnamese, Zulu.
After pretraining, we build an ASR system based on the CTC-attention structure. In very low-resource tasks, we find that building too many randomly initialized network structures on top of the pretrained conformer encoder destroys its transfer performance, so we only build a single-layer transformer decoder for joint training.
Pretrained model link: https://huggingface.co/emiyasstar/ch-w2v-conformer
## constrained-plus Task Performance
* Languages: Cantonese, Mongolian, Kazakh
* Config: conf/train_conformer_large_10h.yaml
* Feature info: using MFCC features, with dither 1.0, without CMVN
* Training info: lr 0.001, batch size 10, 4 gpus on V100, acc_grad 1, 80 epochs
* Decoding info: ctc_weight 0.5, average_num 35

Dev set results, trained with only the 10-hour training set:
## w2v-Conformer
| decoding_method | Cantonese (CER) | Mongolian (WER) |
|:-------------------:|:----:|:----:|
| ctc_greedy_search | 31.46 | 53.64 |
| ctc_prefix_search | 31.47 | 53.50 |
| attention_rescoring | 31.45 | 52.96 |
## Conformer (train from scratch)
| decoding_method | Cantonese (CER) | Mongolian (WER) |
|:-------------------:|:----:|:----:|
| ctc_greedy_search | 61.43 | 89.38 |
| ctc_prefix_search | 61.37 | 89.53 |
| attention_rescoring | 60.61 | 89.60 |
# A giant configuration file for all the BABEL languages,
# as well as some training configurations for training HMM-GMM systems
# for obtaining phoneme-level alignments, if you really want to do that.
# All paths starting with /export/* are set for the JHU/CLSP grid and should
# be changed appropriately for other users.
# Cantonese
train_data_dir_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/build
train_data_list_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/build/ct_train_openasr21_uniq
train_data_dir_101_FLP=/export/babel/data/101-cantonese/release-current/conversational/training
train_data_list_101_FLP=./conf/lists/101-cantonese/train.FullLP.list
dev10h_data_dir_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/dev
dev10h_data_list_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/dev/ct_dev_openasr21_uniq
lexicon_file_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/build/reference_materials/lexicon.txt
lexiconFlags_101="--romanized --oov <unk>"
# Kazakh
train_data_dir_302=/export/babel/data/302-kazakh/IARPA-babel302b-v1.0a-build/BABEL_OP2_302/conversational/training
train_data_list_302=./conf/lists/302-kazakh/sub-train.list
train_data_dir_302_FLP=/export/babel/data/302-kazakh/IARPA-babel302b-v1.0a-build/BABEL_OP2_302/conversational/training
train_data_list_302_FLP=./conf/lists/302-kazakh/training.list
dev10h_data_dir_302=/export/babel/data/302-kazakh/IARPA-babel302b-v1.0a-build/BABEL_OP2_302/conversational/dev
dev10h_data_list_302=./conf/lists/302-kazakh/dev.list
lexicon_file_302=/export/babel/data/302-kazakh/IARPA-babel302b-v1.0a-build/BABEL_OP2_302/conversational/reference_materials/lexicon.sub-train.txt
lexiconFlags_302="--romanized --oov <unk>"
# Mongolian
train_data_dir_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/build
train_data_list_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/build/mn_train_openasr21
dev10h_data_dir_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/dev
dev10h_data_list_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/dev/mn_dev_openasr21
lexicon_file_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/build/reference_materials/lexicon.txt
lexiconFlags_401="--romanized --oov <unk>"
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 24      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.0
    attention_dropout_rate: 0.0
    input_layer: conv2d6 # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    macaron_style: True
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
    cnn_module_norm: 'layer_norm'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 1024
    num_blocks: 1
    dropout_rate: 0.1
    positional_dropout_rate: 0.0
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.7
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

raw_wav: True

# dataset related
dataset_conf:
    filter_conf:
        max_length: 40960
        min_length: 0
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    feats_type: mfcc
    mfcc_conf:
        num_mel_bins: 40
        frame_shift: 10
        frame_length: 25
        num_ceps: 40
        low_freq: 20
        high_freq: -400
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 10

grad_clip: 5
accum_grad: 1
max_epoch: 100
log_interval: 100

optim: adam
optim_conf:
    lr: 0.0004
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 15000
#!/bin/bash
# Dumps pipe-style wav entries (plus segments) to real audio files.
nj=1
. tools/parse_options.sh || exit 1;
inscp=$1
segments=$2
outscp=$3
data=$(dirname ${inscp})
if [ $# -eq 4 ]; then
logdir=$4
else
logdir=${data}/log
fi
mkdir -p ${logdir}
sox=`which sox`
[ ! -x $sox ] && echo "Could not find the sox program at $sox" && exit 1;
paste -d " " <(cut -f 1 -d " " $inscp) <(cut -f 2- -d " " $inscp | tr -t " " "#") \
> $data/wav_ori.scp
tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp
sed -i 's/ /,/g' $data/wav_segments.scp
sed -i 's/#/ /g' $data/wav_segments.scp
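# wav_segments.scp lines are now comma-separated (<uttid>,<pipe command>,<start>,<end>);
# the "#" placeholders inserted above kept the spaces inside the pipe command intact.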
rm -f $logdir/wav_*.slice
rm -f $logdir/*.log
split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_
for slice in `ls $logdir/wav_*.slice`; do
{
name=`basename -s .slice $slice`
mkdir -p ${data}/wavs/${name}
cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \
-v logdir=$logdir -v name=$name '{
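# Each input line is <uttid>,<pipe command>,<start>,<end>. We append
# "sox - <out.wav> trim <start> <duration>" to the pipe command and run it,
# e.g. "sph2pipe ... x.sph | sox - data/wavs/wav_00/utt1.wav trim 7.05 7.02".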
during=$4-$3
cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during;
system(cmd)
printf("%s %s/%s.wav\n", $1, data, $1);
}' | \
sort > ${data}/wavs_${name}.scp || exit 1;
} &
done
wait
cat ${data}/wavs_*.scp > $outscp
rm ${data}/wavs_*.scp
rm -f $data/{segments,wav_segments.scp,reco2file_and_channel,reco2dur}
tools/fix_data_dir.sh $data
#!/bin/bash
# This script replaces the command readlink -f (which is not portable).
# It turns a pathname into an absolute pathname, including following soft links.
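# e.g. "local/make_absolute.sh ./data/raw_train_data" prints something like
# /home/user/recipe/data/raw_train_data (a hypothetical absolute path).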
target_file=$1
cd $(dirname $target_file)
target_file=$(basename $target_file)
# Iterate down a (possible) chain of symlinks
while [ -L "$target_file" ]; do
target_file=$(readlink $target_file)
cd $(dirname $target_file)
target_file=$(basename $target_file)
done
# Compute the canonicalized name by finding the physical path
# for the directory we're in and appending the target file.
phys_dir=$(pwd -P)
result=$phys_dir/$target_file
echo $result
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Yenda Trmal)
# Apache 2.0.
#Begin configuration
ignore_missing_txt=false #If the reference transcript txt is missing, \
#shall we ignore it or treat it as a fatal error?
#End configuration
echo "$0 $@" # Print the command line for logging
help_message="$0: create subset of the input directory (specified as the first directory).
The subset is specified by the second parameter.
The directory in which the subset should be created is the third parameter
Example:
$0 <source-corpus-dir> <subset-descriptor-list-file> <target-corpus-subset-dir>"
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [[ "$#" -ne "3" ]] ; then
echo -e "FATAL: wrong number of script parameters!\n\n"
printf "$help_message\n\n"
exit 1;
fi
input_data_dir=$1
input_data_list=$2
output_data_dir=$3
if [[ ! -d "$input_data_dir" ]] ; then
echo "FATAL: input data directory does not exist";
exit 1;
fi
if [[ ! -f "$input_data_list" ]] ; then
echo "FATAL: input data list file does not exist!";
exit 1;
fi
mkdir -p $output_data_dir/transcription
mkdir -p $output_data_dir/audio
abs_src_dir=`local/make_absolute.sh $input_data_dir`
abs_tgt_dir=`local/make_absolute.sh $output_data_dir`
echo "Making subset..."
for file_basename in `cat $input_data_list`; do
echo $file_basename
if [[ -e $abs_src_dir/audio/$file_basename.sph ]] ; then
ln -sf $abs_src_dir/audio/$file_basename.sph $abs_tgt_dir/audio || exit 1
else
if [[ -e $abs_src_dir/audio/$file_basename.wav ]] ; then
ln -sf $abs_src_dir/audio/$file_basename.wav $abs_tgt_dir/audio || exit 1
else
echo "File $abs_src_dir/audio/$file_basename.sph|wav does not exist!"
exit 1
fi
fi
if [[ -e $abs_src_dir/transcription/$file_basename.txt ]] ; then
ln -sf $abs_src_dir/transcription/$file_basename.txt $abs_tgt_dir/transcription || exit 1
else
echo "File $abs_src_dir/transcription/$file_basename.txt does not exist!"
if ! $ignore_missing_txt ; then
exit 1;
fi
fi
done
#!/usr/bin/env perl
use Getopt::Long;
########################################################################
#
# Script to prepare the Babel acoustic training data for Kaldi.
#
# - Place transcripts in a file named "text"
# Each line contains: utteranceID word1 word2 ...
#
# - Place the utterance-to-speaker map in a file named "utt2spk"
# Each line contains: utteranceID speakerID
# speakerID MUST be a prefix of the utteranceID
# Kaldi code does not require it, but some training scripts do.
#
# - Place the utterance-to-segment map in a file named "segments"
# Each line contains: utteranceID recordingID startTime endTime
#
# - Place the recordingID-to-waveformFile map in "wav.scp"
# Each line contains: recordingID Input_pipe_for_reading_waveform|
#
# - Place the speaker-utterance map in a file named "spk2utt"
# Each line contains: speakerID utteranceID_1 utteranceID_2 ...
# This is the inverse of the utt2spk mapping
#
# Note 1: the utteranceIDs in the first 3 files must match exactly, and
# the recordingIDSs in the last 2 files must match exactly.
#
# Note 2: Babel data formats and file-naming conventions are assumed.
#
# - The transcriptions and waveforms are in subdirectories named
# audio/<filename>.sph
# transcription/<filename>.txt
# There is 1 pair of files per recording, with extensions as above
#
# - The audio is in NIST sphere format, so sph2pipe may be used, e.g.
# BABEL_BP_101_11694_20111204_205320_inLine \
# /export/babel/sanjeev/kaldi-trunk/tools/sph2pipe_v2.5/sph2pipe \
# -f wav -p -c 1 \
# BABEL_BP_101_11694_20111204_205320_inLine.sph|
#
# - The filename contains speaker information, e.g.
# BABEL_BP_101_37210_20111102_170037_O1_scripted.sph -> 37210_A
# BABEL_BP_101_37210_20111102_172955_inLine.sph -> 37210_A
# BABEL_BP_101_37210_20111102_172955_outLine.sph -> 37210_B
# Specifically, the inLine speaker is the same as scripted
#
# - The transcription file has time marks in square brackets, e.g.
# [0.0]
# <no-speech> 喂 <no-speech>
# [7.05]
# 啊 听 听唔听到 啊 <no-speech> 你 而家 仲未 上课 系 嘛 <no-speech>
# [14.07]
#
# - If a vocabulary is provided, map all OOV tokens to an OOV symbol,
# and write out an OOV list with counts to a file named "oovCounts"
#
# If one or more word-fragment markers are provided, this script
# checks if an OOV token can be made in-vocabulary by stripping off
# the markers one by one from either end of the token.
#
# The default settings are
#
$vocabFile = ""; # No vocab file; nothing is mapped to OOV
$OOV_symbol = "<unk>"; # Default OOV symbol
$fragMarkers = ""; # No characters are word-fragment markers
#
# - Babel transcriptions contain 4 kinds of untranscribed words
#
# (()) designates unintelligible words
# <foreign> designates a word in another language
# <prompt> designates a sequence of pre-recorded words
# <overlap> designates two simultaneous foreground speakers
#
# This script maps them to OOV. They are not included in oovCounts
#
# - Babel transcriptions also contain a few non-linguistics tokens
#
# <lipsmack> map to a vocal noise symbol
# <breath> map to a vocal noise symbol
# <cough> map to a vocal noise symbol
# <laugh> map to a vocal noise symbol
#
# <click> map to a nonvocal noise symbol
# <ring> map to a nonvocal noise symbol
# <dtmf> map to a nonvocal noise symbol
# <int> map to a nonvocal noise symbol
#
# <no-speech> designates silence > 1 sec.
#
$vocalNoise = "<v-noise>";
$nVoclNoise = "<noise>";
$silence = "<silence>";
$icu_transform="";
#
########################################################################
GetOptions("fragmentMarkers=s" => \$fragMarkers,
"oov=s" => \$OOV_symbol,
"vocab=s" => \$vocabFile,
"icu-transform=s" => \$icu_transform
);
if ($#ARGV == 1) {
$inDir = $ARGV[0];
$outDir = $ARGV[1];
print STDERR ("$0: $inDir $outDir\n");
if($vocabFile) {
print STDERR ("\tLimiting transcriptions to words in $vocabFile\n");
print STDERR ("\tMapping OOV tokens to \"$OOV_symbol\"\n");
print STDERR ("\tif they remain OOV even after removing [$fragMarkers] from either end\n") if ($fragMarkers);
}
print STDERR ("$0 ADVICE: Use full path for the Input Directory\n") unless ($inDir=~m:^/:);
} else {
print STDERR ("Usage: $0 [--options] InputDir OutputDir\n");
print STDERR ("\t--vocab <file> File containing the permitted vocabulary\n");
print STDERR ("\t--oov <symbol> Use this symbol for OOV words (default <unk>)\n");
print STDERR ("\t--fragmentMarkers <chars> Remove these from ends of words to minimize OOVs (default none)\n");
exit(1);
}
########################################################################
# Read and save the vocabulary and map anything not in the vocab <unk>
########################################################################
if ($vocabFile) {
open (VOCAB, $vocabFile)
|| die "Unable to open vocabulary file $vocabFile";
$numWords = 0;
while (<VOCAB>) {
next unless (m:^([^\s]+):);
$numWords++ unless (exists $inVocab{$1}); # Don't count word repetitions
$inVocab{$1} = 1; # commonly found in lexicons
}
close(VOCAB);
print STDERR ("Read $numWords unique words from $vocabFile\n");
}
########################################################################
# First read segmentation information from all the transcription files
########################################################################
$TranscriptionDir = "$inDir/transcription";
if (-d $TranscriptionDir) {
@TranscriptionFiles = `ls ${TranscriptionDir}/*.txt`;
if ($#TranscriptionFiles >= 0) {
printf STDERR ("$0: Found %d .txt files in $TranscriptionDir\n", ($#TranscriptionFiles +1));
$numFiles = $numUtterances = $numWords = $numOOV = $numSilence = 0;
while ($filename = shift @TranscriptionFiles) {
$fileID = $filename; # To capture the base file name
$fileID =~ s:.+/::; # remove path prefix
$fileID =~ s:\.txt\s*$::; # remove file extension
# For each transcription file, extract and save segmentation data
$numUtterancesThisFile = 0;
$prevTimeMark = -1.0;
$text = "";
if ( $icu_transform ) {
$inputspec="uconv -f utf8 -t utf8 -x \"$icu_transform\" $filename |";
} else {
$inputspec=$filename;
}
open (TRANSCRIPT, $inputspec) || die "Unable to open $filename";
while ($line=<TRANSCRIPT>) {
chomp $line;
if ($line =~ m:^\[([0-9]+\.*[0-9]*)\]$:) {
$thisTimeMark = $1;
if ($thisTimeMark < $prevTimeMark) {
print STDERR ("$0 ERROR: Found segment with negative duration in $filename\n");
print STDERR ("\tStart time = $prevTimeMark, End time = $thisTimeMark\n");
print STDERR ("\tThis could be a sign of something seriously wrong!\n");
print STDERR ("\tFix the file by hand or remove it from the directory, and retry.\n");
exit(1);
}
if ($prevTimeMark<0) {
# Record the first timemark and continue
$prevTimeMark = $thisTimeMark;
next;
}
##################################################
# Create an utteranceID using fileID & start time
# - Assume Babel file naming conventions
# - Remove prefix: program_phase_language
# - inLine = scripted = spkr A, outLine = B
# - Move A/B so that utteranceIDs sort by spkr
# - Assume utterance start time < 10000 sec.
##################################################
$utteranceID = $fileID;
$utteranceID =~ s:[^_]+_[^_]+_[^_]+_::;
$utteranceID =~ s:([^_]+)_(.+)_(inLine|scripted):${1}_A_${2}:;
$utteranceID =~ s:([^_]+)_(.+)_outLine:${1}_B_${2}:;
$utteranceID .= sprintf ("_%06i", (100*$prevTimeMark));
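# e.g. fileID BABEL_BP_101_37210_20111102_172955_outLine with start time
# 7.05 sec yields utteranceID 37210_B_20111102_172955_000705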
##################################################
# Then save segmentation, transcription, speakerID
##################################################
if (exists $transcription{$utteranceID}) {
# utteranceIDs should be unique, but this one is not!
# Either time marks in the transcription file are bad,
# or something went wrong in generating the utteranceID
print STDERR ("$0 WARNING: Skipping duplicate utterance $utteranceID\n");
}
elsif ($text eq "") {
# Could be due to text filtering done below
# Output information to STDOUT to enable > /dev/null
print STDOUT ("$0: Skipping empty transcription $utteranceID\n");
} else {
$transcription{$utteranceID} = $text;
$startTime{$utteranceID} = $prevTimeMark;
$endTime{$utteranceID} = $thisTimeMark;
if ($utteranceID =~ m:([^_]+_[AB]).*:) {
$speakerID{$utteranceID} = $1;
} else {
# default: one speaker per audio file
$speakerID{$utteranceID} = $fileID;
}
$baseFileID{$utteranceID} = $fileID;
$numUtterancesThisFile++;
$numUtterances++;
$text = "";
}
$prevTimeMark = $thisTimeMark;
} else {
@tokens = split(/\s+/, $line);
$text = "";
while ($w = shift(@tokens)) {
# First, some Babel-specific transcription filtering
if (($w eq "<sta>")||($w eq "<male-to-female>")||($w eq "<female-to-male>")||($w eq "~")) {
next;
} elsif (($w eq "<lipsmack>")||($w eq "<breath>")||($w eq "<cough>")||($w eq "<laugh>")) {
$text .= " $vocalNoise";
$numWords++;
} elsif (($w eq "<click>")||($w eq "<ring>")||($w eq "<dtmf>")||($w eq "<int>")){
$text .= " $nVoclNoise";
$numWords++;
} elsif (($w eq "(())")||($w eq "<foreign>")||($w eq "<overlap>")||($w eq "<prompt>")) {
$text .= " $OOV_symbol";
$oovCount{$w}++;
$numOOV++;
$numWords++;
} elsif ($w eq "<no-speech>") {
$text .= " $silence";
$numSilence++;
} else {
# This is just a regular spoken word
if ($vocabFile && (! $inVocab{$w}) && $fragMarkers) {
# $w is a potential OOV token
# Remove fragMarkers to see if $w becomes in-vocabulary
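# e.g. with fragMarkers "-*~", the token "-wor-" is tried as "-wor",
# then "wor", stopping as soon as an in-vocabulary form is found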
while ($w =~ m:^(\S+[$fragMarkers]|[$fragMarkers]\S+)$:) {
if ($w =~ m:^(\S+)[$fragMarkers]$:) {
$w = $1;
last if ($inVocab{$w});
} elsif ($w =~m:^[$fragMarkers](\S+)$:) {
$w = $1;
last if ($inVocab{$w});
} else {
die "Logically, the program should never reach here!";
}
}
}
# If still an OOV, replace $w by $OOV_symbol
if ($vocabFile && (! $inVocab{$w})) {
# $w is definitely an OOV token
if (exists $oovCount{$w}) {
$oovCount{$w}++;
} else {
$oovCount{$w} = 1;
}
$w = $OOV_symbol;
$numOOV++;
}
$text .= " $w";
$numWords++;
}
}
$text =~ s:^\s+::; # Remove leading white space, if any
# Transcriptions must contain real words to be useful in training
$text =~ s:^(($OOV_symbol|$vocalNoise|$nVoclNoise|$silence)[ ]{0,1})+$::;
}
}
close(TRANSCRIPT);
if ($numUtterancesThisFile>0) {
$lastTimeMarkInFile{$fileID} = $prevTimeMark;
$numUtterancesInFile{$fileID} = $numUtterancesThisFile;
$numUtterancesThisFile = 0;
}
$numFiles++;
}
print STDERR ("$0: Recorded $numUtterances non-empty utterances from $numFiles files\n");
} else {
print STDERR ("$0 ERROR: No .txt files found $TranscriptionDir\n");
exit(1);
}
} else {
print STDERR ("$0 ERROR: No directory named $TranscriptionDir\n");
exit(1);
}
########################################################################
# Then verify existence of corresponding audio files and their durations
########################################################################
$AudioDir = "$inDir/audio";
if (-d $AudioDir) {
@AudioFiles = `ls ${AudioDir}/*.sph`;
if ($#AudioFiles >= 0) {
printf STDERR ("$0: Found %d .sph files in $AudioDir\n", ($#AudioFiles +1));
$numFiles = 0;
while ($filename = shift @AudioFiles) {
$fileID = $filename;
$fileID =~ s:.+/::; # remove path prefix
$fileID =~ s:\.sph\s*::; # remove file extension
if (exists $numUtterancesInFile{$fileID}) {
# Some portion of this file has training transcriptions
@Info = `head $filename`;
$SampleCount = -1;
$SampleRate = 8000; #default
while ($#Info>=0) {
$line = shift @Info;
$SampleCount = $1 if ($line =~ m:sample_count -i (\d+):);
$SampleRate = $1 if ($line =~ m:sample_rate -i (\d+):);
}
if ($SampleCount<0) {
# Unable to extract a valid duration from the sphere header
print STDERR ("Unable to extract duration: skipping file $filename");
} else {
$waveformName{$fileID} = $filename; chomp $waveformName{$fileID};
$duration{$fileID} = $SampleCount/$SampleRate;
$numFiles++;
}
} else {
# Could be due to text filtering resulting in an empty transcription
# Output information to STDOUT to enable > /dev/null
print STDOUT ("$0: No transcriptions for audio file ${fileID}.sph\n");
}
}
print STDERR ("$0: Recorded durations from headers of $numFiles .sph files\n");
} else {
print STDERR ("$0 NOTICE: No .sph files in $AudioDir\n");
}
@AudioFiles = `ls ${AudioDir}/*.wav`;
if ($#AudioFiles >= 0) {
$soxi=`which soxi` or die "Could not find soxi binary -- do you have sox installed?\n";
chomp $soxi;
printf STDERR ("$0: Found %d .wav files in $AudioDir\n", ($#AudioFiles +1));
print STDERR "Soxi found: $soxi\n";
$numFiles = 0;
while ($filename = shift @AudioFiles) {
$fileID = $filename;
$fileID =~ s:.+/::; # remove path prefix
$fileID =~ s:\.wav\s*::; # remove file extension
if (exists $numUtterancesInFile{$fileID}) {
# Some portion of this file has training transcriptions
$duration = `$soxi -D $filename`;
if ($duration <=0) {
# Unable to extract a valid duration via soxi
print STDERR ("Unable to extract duration: skipping file $filename");
} else {
if (exists $waveformName{$fileID} ) {
print STDERR ("$0 ERROR: duplicate fileID \"$fileID\" for files \"$filename\" and \"" . $waveformName{$fileID} ."\"\n");
exit(1);
}
$waveformName{$fileID} = $filename; chomp $waveformName{$fileID};
$duration{$fileID} = $duration;
$numFiles++;
}
} else {
# Could be due to text filtering resulting in an empty transcription
# Output information to STDOUT to enable > /dev/null
print STDOUT ("$0: No transcriptions for audio file ${fileID}.sph\n");
}
}
print STDERR ("$0: Recorded durations from headers of $numFiles .sph files\n");
} else {
print STDERR ("$0 NOTICE: No .wav files in $AudioDir\n");
}
if (scalar(keys %waveformName) == 0) {
print STDERR ("$0 ERROR: No audio files found!\n");
exit(1);
}
} else {
print STDERR ("$0 ERROR: No directory named $AudioDir\n");
exit(1);
}
########################################################################
# Now all the needed information is available. Write out the 4 files.
########################################################################
unless (-d $outDir) {
print STDERR ("$0: Creating output directory $outDir\n");
die "Failed to create output directory" if (`mkdir -p $outDir`); # i.e. if the exit status is not zero.
}
print STDERR ("$0: Writing 5 output files to $outDir\n");
$textFileName = "$outDir/text";
open (TEXT, "> $textFileName") || die "$0 ERROR: Unable to write text file $textFileName\n";
$utt2spkFileName = "$outDir/utt2spk";
open (UTT2SPK, "> $utt2spkFileName") || die "$0 ERROR: Unable to write utt2spk file $utt2spkFileName\n";
$segmentsFileName = "$outDir/segments";
open (SEGMENTS, "> $segmentsFileName") || die "$0 ERROR: Unable to write segments file $segmentsFileName\n";
$scpFileName = "$outDir/wav.scp";
open (SCP, "| sort -u > $scpFileName") || die "$0 ERROR: Unable to write wav.scp file $scpFileName\n";
my $binary=$ENV{SPH2PIPE};
$SPHBINARY ="$binary -f wav -p -c 1";
my $SOXBINARY =`which sox` or die "Could not find the sox command"; chomp $SOXBINARY;
$SOXFLAGS ="-r 8000 -c 1 -b 16 -t wav - downsample";
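# A resulting wav.scp entry therefore looks like (path hypothetical):
#   BABEL_BP_101_11694_20111204_205320_inLine /path/to/sph2pipe -f wav -p -c 1 /path/to/audio.sph |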
$spk2uttFileName = "$outDir/spk2utt";
open (SPK2UTT, "> $spk2uttFileName") || die "$0 ERROR: Unable to write spk2utt file $spk2uttFileName\n";
$oovFileName = "$outDir/oovCounts";
open (OOV, "| sort -nrk2 > $oovFileName") || die "$0 ERROR: Unable to write oov file $oovFileName\n";
$numUtterances = $numSpeakers = $numWaveforms = 0;
$totalSpeech = $totalSpeechSq = 0.0;
foreach $utteranceID (sort keys %transcription) {
$fileID = $baseFileID{$utteranceID};
if (exists $waveformName{$fileID}) {
# There are matching transcriptions and audio
$numUtterances++;
$totalSpeech += ($endTime{$utteranceID} - $startTime{$utteranceID});
$totalSpeechSq += (($endTime{$utteranceID} - $startTime{$utteranceID})
*($endTime{$utteranceID} - $startTime{$utteranceID}));
print TEXT ("$utteranceID $transcription{$utteranceID}\n");
print UTT2SPK ("$utteranceID $speakerID{$utteranceID}\n");
print SEGMENTS ("$utteranceID $fileID $startTime{$utteranceID} $endTime{$utteranceID}\n");
if (exists $uttList{$speakerID{$utteranceID}}) {
$uttList{$speakerID{$utteranceID}} .= " $utteranceID";
} else {
$numSpeakers++;
$uttList{$speakerID{$utteranceID}} = "$utteranceID";
}
next if (exists $scpEntry{$fileID});
$numWaveforms++;
if ($waveformName{$fileID} =~ /.*\.sph/ ) {
$scpEntry{$fileID} = "$SPHBINARY $waveformName{$fileID} |";
} else {
$scpEntry{$fileID} = "$SOXBINARY $waveformName{$fileID} $SOXFLAGS |";
}
} else {
print STDERR ("$0 WARNING: No audio file for transcription $utteranceID\n");
}
}
foreach $fileID (sort keys %scpEntry) {
print SCP ("$fileID $scpEntry{$fileID}\n");
}
foreach $speakerID (sort keys %uttList) {
print SPK2UTT ("$speakerID $uttList{$speakerID}\n");
}
foreach $w (sort keys %oovCount) {
print OOV ("$w\t$oovCount{$w}\n");
}
exit(1) unless (close(TEXT) && close(UTT2SPK) && close(SEGMENTS) && close(SCP) && close(SPK2UTT) && close(OOV));
print STDERR ("$0: Summary\n");
print STDERR ("\tWrote $numUtterances lines each to text, utt2spk and segments\n");
print STDERR ("\tWrote $numWaveforms lines to wav.scp\n");
print STDERR ("\tWrote $numSpeakers lines to spk2utt\n");
print STDERR ("\tHmmm ... $numSpeakers distinct speakers in this corpus? Unusual!\n")
if (($numSpeakers<($numUtterances/500.0)) || ($numSpeakers>($numUtterances/2.0)));
print STDERR ("\tTotal # words = $numWords (including $numOOV OOVs) + $numSilence $silence\n")
if ($vocabFile);
printf STDERR ("\tAmount of speech = %.2f hours (including some due to $silence)\n", $totalSpeech/3600.0);
if ($numUtterances>0) {
printf STDERR ("\tAverage utterance length = %.2f sec +/- %.2f sec, and %.2f words\n",
$totalSpeech /= $numUtterances,
sqrt(($totalSpeechSq/$numUtterances)-($totalSpeech*$totalSpeech)),
$numWords/$numUtterances);
}
exit(0);
########################################################################
# Done!
########################################################################
#!/bin/bash
# Copyright 2018 Johns Hopkins University (Matthew Wiesner)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
# This is not necessarily the top-level run.sh as it is in other directories; see README.txt first.
. ./conf/lang.conf
. ./path.sh
. ./cmd.sh
sph2pipe_version="v2.5"
if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then
echo "Download sph2pipe_${sph2pipe_version} ......"
wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \
wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \
tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools
cd tools/sph2pipe_${sph2pipe_version}/ && \
gcc -o sph2pipe *.c -lm
cd -
fi
sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe
[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1;
export SPH2PIPE=$sph2pipe
sox=`which sox`
[ ! -x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1;
FLP=false
. ./utils/parse_options.sh
if [ $# -ne 1 ]; then
echo >&2 "Usage: ./local/prepare_data.sh [opts] <lang_id>"
echo >&2 " --FLP : Use FLP training data (instead of LLP ~10h)"
exit 1
fi
l=$1
l_suffix=${l}
if $FLP; then
l_suffix=${l_suffix}_FLP
fi
#Preparing train directories
if [ ! -f data/raw_train_data/.done ]; then
echo ---------------------------------------------------------------------
echo "Subsetting the TRAIN set"
echo ---------------------------------------------------------------------
train_data_dir=train_data_dir_${l_suffix}
train_data_list=train_data_list_${l_suffix}
local/make_corpus_subset.sh "${!train_data_dir}" "${!train_data_list}" ./data/raw_train_data
train_data_dir=`utils/make_absolute.sh ./data/raw_train_data`
touch data/raw_train_data/.done
fi
#exit 0
#Preparing dev10 directories
if [ ! -f data/raw_dev10h_data/.done ]; then
echo ---------------------------------------------------------------------
echo "Subsetting the Dev set"
echo ---------------------------------------------------------------------
dev10h_data_dir=dev10h_data_dir_${l}
dev10h_data_list=dev10h_data_list_${l}
local/make_corpus_subset.sh "${!dev10h_data_dir}" "${!dev10h_data_list}" ./data/raw_dev10h_data
dev10h_data_dir=`utils/make_absolute.sh ./data/raw_dev10h_data`
touch data/raw_dev10h_data/.done
fi
dev10h_data_dir=`utils/make_absolute.sh ./data/raw_dev10h_data`
train_data_dir=`utils/make_absolute.sh ./data/raw_train_data`
lexicon_file=lexicon_file_${l_suffix}
if [[ ! -f data/train/wav.scp || data/train/wav.scp -ot "$train_data_dir" ]]; then
echo ---------------------------------------------------------------------
echo "Preparing acoustic training lists in data/train on" `date`
echo ---------------------------------------------------------------------
mkdir -p data/train.tmp
local/prepare_acoustic_training_data.pl \
--fragmentMarkers \-\*\~ \
$train_data_dir data/train.tmp > data/train.tmp/skipped_utts.log
fi
if [[ ! -f data/dev10h.pem/wav.scp || data/dev10h.pem/wav.scp -ot "$dev10h_data_dir" ]]; then
echo ---------------------------------------------------------------------
echo "Preparing acoustic training lists in data/train on" `date`
echo ---------------------------------------------------------------------
mkdir -p data/dev10h.pem
local/prepare_acoustic_training_data.pl \
--fragmentMarkers \-\*\~ \
$dev10h_data_dir data/dev10h.pem > data/dev10h.pem/skipped_utts.log
fi
###########################################################################
# Prepend language ID to all utterances to disambiguate between speakers
# of different languages sharing the same speaker id.
#
# The individual lang directories can be used for alignments, while a
# combined directory will be used for training. This probably has minimal
# impact on performance as only words repeated across languages will pose
# problems and even amongst these, the main concern is the <hes> marker.
###########################################################################
num_utts=$(cat data/train.tmp/segments | wc -l)
dev_utts=$((num_utts / 10))
./utils/subset_data_dir.sh data/train.tmp ${dev_utts} data/train_dev
awk '{print $1}' data/train_dev/utt2spk > data/train_dev.list
awk '{print $1}' data/train.tmp/utt2spk | grep -vf data/train_dev.list > data/train.list
./utils/subset_data_dir.sh --utt-list data/train.list data/train.tmp data/train
echo "Prepend ${l} to data dir"
./utils/copy_data_dir.sh --spk-prefix "${l}_" --utt-prefix "${l}_" \
data/train data/train_${l}
./utils/copy_data_dir.sh --spk-prefix "${l}_" --utt-prefix "${l}_" \
data/train_dev data/dev_${l}
./utils/copy_data_dir.sh --spk-prefix "${l}_" --utt-prefix "${l}_" \
data/dev10h.pem data/eval_${l}
#!/bin/bash
# Copyright 2018 Johns Hopkins University (Matthew Wiesner)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
. ./path.sh
. ./cmd.sh
. ./conf/lang.conf
#langs="101 102 103 104 105 106 202 203 204 205 206 207 301 302 303 304 305 306 401 402 403"
langs="101"
recog="101"
FLP=false
garbage_utterance_tags="<silence> <v-noise> <noise> <unk> <hes>"
. ./tools/parse_options.sh
set -e
set -o pipefail
all_langs=""
for l in `cat <(echo ${langs}) <(echo ${recog}) | tr " " "\n" | sort -u`; do
all_langs="${l} ${all_langs}"
done
all_langs=${all_langs%% }
# Save top-level directory
cwd=$(local/make_absolute.sh `pwd`)
echo "Stage 0: Setup Language Specific Directories"
echo "cwd"
echo $cwd
echo " --------------------------------------------"
echo "Languagues: ${all_langs}"
# Basic directory prep
for l in ${all_langs}; do
[ -d data/${l} ] || mkdir -p data/${l}
cd data/${l}
ln -sf ${cwd}/local .
for f in ${cwd}/{tools,conf}; do
link=`make_absolute.sh $f`
ln -sf $link .
done
cp ${cwd}/cmd.sh .
cp ${cwd}/path.sh .
sed -i 's/\.\.\/\.\.\/\.\./\.\.\/\.\.\/\.\.\/\.\.\/\.\./g' path.sh
cd ${cwd}
done
# Prepare language specific data
for l in ${all_langs}; do
(
cd data/${l}
./local/prepare_data.sh --FLP ${FLP} ${l}
cd ${cwd}
) &
done
wait
# Combine all language specific training directories and generate a single
# lang directory by combining all language specific dictionaries
train_dirs=""
dev_dirs=""
eval_dirs=""
for l in ${langs}; do
train_dirs="data/${l}/data/train_${l} ${train_dirs}"
done
for l in ${recog}; do
dev_dirs="data/${l}/data/dev_${l} ${dev_dirs}"
done
./tools/combine_data.sh data/train ${train_dirs}
./tools/combine_data.sh data/dev ${dev_dirs}
for l in ${recog}; do
ln -s ${cwd}/data/${l}/data/eval_${l} ${cwd}/data/eval_${l}
done
# Delete utterances with garbage meta tags
for tag in $garbage_utterance_tags; do
sed -i "s/${tag}//g" data/train/text
sed -i "s/${tag}//g" data/dev/text
sed -i "s/${tag}//g" data/eval_${l}/text
done
sed -i "/_.*[0-9][ ]*$/d" data/train/text
sed -i "/_.*[0-9][ ]*$/d" data/dev/text
sed -i "/_.*[0-9][ ]*$/d" data/eval_${l}/text
sed -i 's/[ ][ ]*/ /g' data/train/text
sed -i 's/[ ][ ]*/ /g' data/dev/text
sed -i 's/[ ][ ]*/ /g' data/eval_${l}/text
./tools/fix_data_dir.sh data/train
./tools/fix_data_dir.sh data/dev
./tools/fix_data_dir.sh data/eval_${l}
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2021 Tencent Inc. (Author: Kai Tang).
# Apache 2.0
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3"
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
# data
data=data
data_url=www.openslr.org/resources/33
nj=4
# langid: 101 Cantonese, 302 Kazakh, 401 Mongolian
langs="101"
recog="101"
token_type=char
# bpemode (unigram or bpe)
nbpe=4500
bpemode=unigram
# data_type can be `raw` or `shard`. Typically, raw is used for small dataset,
# `shard` is used for large dataset which is over 1k hours, and `shard` is
# faster on reading data and training.
data_type=raw
num_utts_per_shard=1000
if [ "${token_type}" = bpe ]; then
dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=data/lang_char/${train_set}_${bpemode}${nbpe}
elif [ "${token_type}" = char ]; then
dict=data/lang_char/lang_char.txt
bpe_model=
else
echo "Error: not supported token_type"
exit 0
fi
train_set=train_sp
train_dev=dev
recog_set=eval_$recog
# pretrained w2v-conformer encoder
enc_init=pretrain/conformer.pt
# Re-init the last pretrained encoder layer: https://arxiv.org/pdf/2107.04734.pdf
enc_init_mods='encoder.encoders.0,encoder.encoders.1,encoder.encoders.2,encoder.encoders.3,encoder.encoders.4,encoder.encoders.5,encoder.encoders.6,encoder.encoders.7,encoder.encoders.8,encoder.encoders.9,encoder.encoders.10,encoder.encoders.11,encoder.encoders.12,encoder.encoders.13,encoder.encoders.14,encoder.encoders.15,encoder.encoders.16,encoder.encoders.17,encoder.encoders.18,encoder.encoders.19,encoder.encoders.20,encoder.encoders.21,encoder.encoders.22,encoder.embed'
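# Note: with num_blocks 24, this list covers encoder.encoders.0-22 plus the input
# embedding, so the last block (encoder.encoders.23) is randomly re-initialized.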
train_config=conf/train_conformer_large_10h.yaml
checkpoint=
cmvn=false
dir=exp/${langs}_finetune_10h
# Using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=35
. utils/parse_options.sh || exit 1;
#Babel style data preparation
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "stage 0: Setting up individual languages"
./local/setup_languages.sh --langs "${langs}" --recog "${recog}"
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# Data preparation
for x in ${train_set} ${train_dev} ${recog_set}; do
# Remove the space in text
if [ "${token_type}" = char ]; then
cp data/${x}/text data/${x}/text.org
paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \
> data/${x}/text
rm data/${x}/text.org
fi
done
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Make train dict
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
if [ "${token_type}" = bpe ]; then
# We borrowed the BPE-related code and scripts from ESPnet.
cut -f 2- -d" " data/${train_set}/text | sort > data/lang_char/input.txt
tools/spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
tools/spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
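# e.g. spm_encode splits "HELLO WORLD" into pieces such as "▁HELLO ▁WOR LD"
# (the actual segmentation depends on the learned unigram model)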
elif [ "${token_type}" = char ]; then
tools/text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
| sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
fi
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 1: format scp "
#dumps such pipe-style-wav to real audio file
for x in ${train_set} ${train_dev} ${recog_set}; do
cp data/${x}/wav.scp data/${x}/wav.scp.org
bash local/dump_wav.sh --nj 26 data/$x/wav.scp.org data/$x/segments data/$x/wav.scp
rm data/$x/wav.scp.org
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "Prepare data, prepare required format"
# For wav feature, just copy the data. mfcc/fbank extraction is done in training
for x in ${train_set} ${train_dev} ${recog_set}; do
if [ $data_type == "shard" ]; then
tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
--num_threads 16 data/$x/wav.scp data/$x/text \
$(realpath data/$x/shards) data/$x/data.list
else
tools/make_raw_list.py data/$x/wav.scp data/$x/text \
data/$x/data.list
fi
done
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="nccl"
cmvn_opts=
$cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
${bpemodel:+--bpe_model ${bpemodel}.model} \
--train_data data/$train_set/data.list \
--cv_data data/$train_dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
${enc_init:+--enc_init $enc_init} \
--enc_init_mods $enc_init_mods \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $num_gpus \
--ddp.rank $i \
--ddp.dist_backend $dist_backend \
--num_workers 6 \
$cmvn_opts
} &
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Test model, please specify the model you want to test by --checkpoint
cmvn_opts=
$cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
mkdir -p $dir/test
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
ctc_weight=0.5
for mode in ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring; do
for rtask in ${recog_set}; do
{
test_dir=$dir/test_${rtask}_${mode}
mkdir -p $test_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data data/$rtask/data.list \
--checkpoint $decode_checkpoint \
--beam_size 5 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
${bpemodel:+--bpe_model ${bpemodel}.model} \
--ctc_weight $ctc_weight \
--result_file $test_dir/text_ori \
$cmvn_opts \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
if [ "${token_type}" = bpe ]; then
tools/spm_decode --model=${bpemodel}.model --input_format=piece < $test_dir/text_ori | sed -e "s/▁/ /g" > $test_dir/text
python tools/compute-wer.py --char=0 --v=1 \
data/$rtask/text $test_dir/text > $test_dir/wer
elif [ "${token_type}" = char ]; then
python tools/compute-wer.py --char=1 --v=1 \
data/$rtask/text $test_dir/text_ori > $test_dir/wer
fi
} &
done
done
wait
fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip
fi
../../../tools
../../../wenet/
# Performance Record
## Conformer Result
* Feature info: dither + specaug + speed perturb
* Training info: lr 0.001, warmup_steps 25000, batch size 16, 1 gpu, acc_grad 4, 240 epochs
* Decoding info: average_num 10
| decoding mode | eval2000 (wer) |
|:----------------------:|:----------------:|
| ctc_greedy_search | 32.39% |
| ctc_prefix_beam_search | 32.39% |
| attention | 31.28% |
| attention_rescoring | 31.36% |