# Performance Record
This is a Chinese speech recognition recipe that trains on a combination of the following Chinese corpora:
| Dataset | Duration (Hours) |
|------------|------------------|
| Aidatatang | 140 |
| Aishell | 151 |
| MagicData | 712 |
| Primewords | 99 |
| ST-CMDS | 110 |
| THCHS-30 | 26 |
| TAL-ASR | 587 |
| AISHELL2 | 1000 |
## Unified Transformer Result
### Data info:
* Dataset: Aidatatang, Aishell, MagicData, Primewords, ST-CMDS, and THCHS-30.
* Feature info: using fbank feature, with cmvn, no speed perturb.
* Training info: lr 0.004, batch size 18, 3 machines, 3*8 = 24 GPUs, acc_grad 1, 220 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 30
* Git hash: 013794572a55c7d0dbea23a66106ccf3e5d3b8d4
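
The averaged checkpoint used for decoding can be produced with WeNet's model-averaging tool. A minimal sketch, assuming the standard `wenet/bin/average_model.py` interface and a placeholder experiment directory (flag names may differ across WeNet versions):

```bash
exp=exp/unified_transformer   # placeholder experiment directory
python wenet/bin/average_model.py \
    --dst_model $exp/avg_30.pt \
    --src_path $exp \
    --num 30 \
    --val_best   # average the 30 checkpoints with the best CV loss
```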
### WER
| Dataset | chunk size | attention decoder | ctc greedy search | ctc prefix beam search | attention rescoring |
|------------|------------|-------------------|-------------------|------------------------|---------------------|
| Aidatatang | full | 4.23 | 5.82 | 5.82 | 4.71 |
| | 16 | 4.59 | 6.99 | 6.99 | 5.29 |
| Aishell | full | 4.69 | 5.80 | 5.80 | 4.64 |
| | 16 | 4.97 | 6.75 | 6.75 | 5.37 |
| MagicData | full | 2.86 | 4.01 | 4.00 | 3.07 |
| | 16 | 3.10 | 5.02 | 5.02 | 3.68 |
| THCHS-30 | full | 16.68 | 15.46 | 15.46 | 14.38 |
| | 16 | 17.47 | 16.81 | 16.82 | 15.63 |
## Unified Conformer Result
### Data info:
* Dataset: Aidatatang, Aishell, MagicData, Primewords, ST-CMDS, and THCHS-30.
* Feature info: using fbank feature, with cmvn, speed perturb.
* Training info: lr 0.001, batch size 8, 1 machine, 1*8 = 8 GPUs, acc_grad 12, 60 epochs
* Decoding info: ctc_weight 0.5, average_num 10
* Git hash: 5bdf436e671ef4c696d1b039f29cc33109e072fa
### WER
| Dataset | chunk size | attention decoder | ctc greedy search | ctc prefix beam search | attention rescoring |
|------------|------------|-------------------|-------------------|------------------------|---------------------|
| Aidatatang | full | 4.12 | 4.97 | 4.97 | 4.22 |
| | 16 | 4.45 | 5.73 | 5.73 | 4.75 |
| Aishell | full | 4.49 | 5.07 | 5.05 | 4.43 |
| | 16 | 4.77 | 5.77 | 5.77 | 4.85 |
| MagicData | full | 2.55 | 3.07 | 3.05 | 2.59 |
| | 16 | 2.81 | 3.88 | 3.86 | 3.08 |
| THCHS-30 | full | 13.55 | 13.75 | 13.76 | 12.72 |
| | 16 | 13.78 | 15.10 | 15.08 | 13.90 |
## Unified Conformer Result (with TAL-ASR and AISHELL2 added)
### Data info:
* Dataset: Aidatatang, Aishell, MagicData, Primewords, ST-CMDS, THCHS-30, TAL-ASR, and AISHELL2.
* Feature info: using fbank feature, dither=0, cmvn, speed perturb
* Training info: lr 0.001, batch size 22, 4 GPUs, acc_grad 4, 120 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 10
* Git hash: 66f30c197d00c59fdeda3bc8ada801f867b73f78
### WER
| Dataset | chunk size | attention decoder | ctc greedy search | ctc prefix beam search | attention rescoring |
|------------|------------|-------------------|-------------------|------------------------|---------------------|
| Aidatatang | full | 3.22 | 4.00 | 4.01 | 3.35 |
| | 16 | 3.50 | 4.63 | 4.63 | 3.79 |
| Aishell | full | 1.23 | 2.12 | 2.13 | 1.42 |
| | 16 | 1.33 | 2.72 | 2.72 | 1.72 |
| MagicData | full | 2.38 | 3.07 | 3.05 | 2.52 |
| | 16 | 2.66 | 3.80 | 3.78 | 2.94 |
| THCHS-30 | full | 9.93 | 11.07 | 11.06 | 10.16 |
| | 16 | 10.28 | 11.85 | 11.85 | 10.81 |
| AISHELL2 | full | 5.25 | 5.81 | 5.79 | 5.22 |
| | 16 | 5.48 | 6.48 | 6.50 | 5.61 |
| TAL-ASR | full | 9.54 | 10.35 | 10.28 | 9.66 |
| | 16 | 10.04 | 11.43 | 11.39 | 10.55 |
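
As a sanity check, the numbers above can be recomputed from a decoding run with WeNet's scorer. A sketch, assuming the standard `tools/compute-wer.py` script and placeholder paths:

```bash
# ref and hyp are Kaldi-style "utt-id transcript" files; --char=1 scores at
# character level, the usual choice for Chinese (so "WER" here is effectively CER)
python tools/compute-wer.py --char=1 --v=1 \
    data/aishell/test/text exp/unified_conformer/test/text > exp/unified_conformer/test/wer
```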
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: true
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

dataset_conf:
    filter_conf:
        max_length: 40960
        min_length: 0
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 16

grad_clip: 5
accum_grad: 4
max_epoch: 240
log_interval: 100

optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
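# ----------------------------------------------------------------------------
# A hypothetical launch sketch for the config above, assuming it is saved as
# conf/train_conformer.yaml and that data.list files were produced by the prep
# scripts below; flag names follow wenet/bin/train.py and may differ across
# WeNet versions:
#
#   python wenet/bin/train.py \
#     --config conf/train_conformer.yaml \
#     --train_data data/train/data.list \
#     --cv_data data/dev/data.list \
#     --model_dir exp/conformer
# ----------------------------------------------------------------------------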
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: true
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
    causal: true
    use_dynamic_chunk: true
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

dataset_conf:
    filter_conf:
        max_length: 40960
        min_length: 0
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 1.0
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 16

grad_clip: 5
accum_grad: 1
max_epoch: 180
log_interval: 100

optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
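# ----------------------------------------------------------------------------
# causal: true plus use_dynamic_chunk: true above trains a single model that
# decodes both full-context and streaming. A hypothetical decoding sketch
# (flag names follow wenet/bin/recognize.py and may differ across WeNet
# versions): --decoding_chunk_size -1 gives the "full" rows in the result
# tables, 16 gives the streaming "16" rows.
#
#   python wenet/bin/recognize.py --config conf/train_unified_conformer.yaml \
#     --checkpoint exp/unified_conformer/avg_10.pt \
#     --mode attention_rescoring --decoding_chunk_size 16 ...
# ----------------------------------------------------------------------------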
# network architecture
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder architecture type
    normalize_before: true
    use_dynamic_chunk: true
    use_dynamic_left_chunk: false

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

dataset_conf:
    filter_conf:
        max_length: 40960
        min_length: 0
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 16

grad_clip: 5
accum_grad: 1
max_epoch: 220
log_interval: 100

optim: adam
optim_conf:
    lr: 0.004
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
#!/bin/bash
# Copyright 2017 Xingyu Na
# Apache 2.0
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-path> <data-path>"
  echo " $0 /export/a05/xna/data/data_aidatatang_200zh data/aidatatang"
  exit 1;
fi

aidatatang_audio_dir=$1/corpus
aidatatang_text=$1/transcript/aidatatang_200_zh_transcript.txt
data=$2
train_dir=$data/local/train
dev_dir=$data/local/dev
test_dir=$data/local/test
tmp_dir=$data/local/tmp

mkdir -p $train_dir
mkdir -p $dev_dir
mkdir -p $test_dir
mkdir -p $tmp_dir

# data directory check
if [ ! -d $aidatatang_audio_dir ] || [ ! -f $aidatatang_text ]; then
  echo "Error: $0 requires $aidatatang_audio_dir and $aidatatang_text to exist"
  exit 1;
fi

echo "**** Creating aidatatang data folder ****"

# find wav audio files for train, dev and test resp.
find $aidatatang_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
n=$(wc -l < $tmp_dir/wav.flist)
[ $n -ne 237265 ] && \
  echo "Warning: expected 237265 data files, found $n"

grep -i "corpus/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
grep -i "corpus/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
grep -i "corpus/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
rm -r $tmp_dir

# Transcription preparation
for dir in $train_dir $dev_dir $test_dir; do
  echo Preparing $dir transcriptions
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all
  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
  # keep transcripts whose utt-id has audio; map full-width Ａ to ASCII A
  tools/filter_scp.pl -f 1 $dir/utt.list $aidatatang_text | sed 's/Ａ/A/g' > $dir/transcripts.txt
  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
  tools/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u | awk '{print $1" T0055"$2}' > $dir/utt2spk
  tools/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
  sort -u $dir/transcripts.txt > $dir/text
  tools/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
done

mkdir -p $data/train $data/dev $data/test
for f in spk2utt utt2spk wav.scp text; do
  cp $train_dir/$f $data/train/$f || exit 1;
  cp $dev_dir/$f $data/dev/$f || exit 1;
  cp $test_dir/$f $data/test/$f || exit 1;
done

# utils/data/validate_data_dir.sh --no-feats $data/train || exit 1;
# utils/data/validate_data_dir.sh --no-feats $data/dev || exit 1;
# utils/data/validate_data_dir.sh --no-feats $data/test || exit 1;

echo "$0: aidatatang_200zh data preparation succeeded"
exit 0;
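# For illustration, the four files installed above follow the Kaldi data-dir
# convention, one space-separated entry per line (IDs below are hypothetical):
#
#   wav.scp : T0055G0002S0001 /path/to/corpus/train/G0002/T0055G0002S0001.wav
#   text    : T0055G0002S0001 <transcript>
#   utt2spk : T0055G0002S0001 T0055G0002
#   spk2utt : T0055G0002 T0055G0002S0001 T0055G0002S0002 ...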
#!/bin/bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# 2017 Xingyu Na
# Apache 2.0
remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/62 aidatatang_200zh"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: aidatatang_200zh."
  exit 1;
fi

data=$1
url=$2
part=$3

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data, creating it."
  mkdir -p $data
fi

part_ok=false
list="aidatatang_200zh"
for x in $list; do
  if [ "$part" == $x ]; then part_ok=true; fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1;
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1;
fi

if [ -f $data/$part/.complete ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0;
fi

# size of the archive file in bytes
sizes="18756983399"

if [ -f $data/$part.tgz ]; then
  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
  size_ok=false
  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
  if ! $size_ok; then
    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm $data/$part.tgz
  else
    echo "$data/$part.tgz exists and appears to be complete."
  fi
fi

if [ ! -f $data/$part.tgz ]; then
  if ! which wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1;
  fi
  full_url=$url/$part.tgz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."
  cd $data
  if ! wget --no-check-certificate $full_url; then
    echo "$0: error executing wget $full_url"
    exit 1;
  fi
fi

cd $data
if ! tar -xvzf $part.tgz; then
  echo "$0: error un-tarring archive $data/$part.tgz"
  exit 1;
fi
touch $data/$part/.complete

dev_dir=$data/$part/corpus/dev
test_dir=$data/$part/corpus/test
train_dir=$data/$part/corpus/train
if [ $part == "aidatatang_200zh" ]; then
  for set in $dev_dir $test_dir $train_dir; do
    cd $set
    for wav in ./*.tar.gz; do
      echo "Extracting wav from $wav"
      tar -zxf $wav && rm $wav
    done
  done
fi

echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"

if $remove_archive; then
  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
  rm $data/$part.tgz
fi
exit 0;
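# Note: the archive size probe above parses `ls -l`. A sketch of a sturdier
# equivalent using stat (GNU coreutils; on BSD/macOS the flag is `stat -f %z`):
#
#   size=$(stat -c '%s' "$data/$part.tgz" 2>/dev/null)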
#!/usr/bin/env bash
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0
# This script is copied from aishell2/s5/local/prepare_data.sh
# but using difference word segmentation script.
# transform raw AISHELL-2 data to kaldi format
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-data-dir> <output-dir>"
  echo " $0 /export/AISHELL-2/iOS/train data/train"
  exit 1;
fi

corpus=$1
dir=$2
tmp=$dir/tmp

echo "prepare_data.sh: Preparing data in $corpus"
mkdir -p $dir
mkdir -p $tmp

# corpus check
if [ ! -d $corpus ] || [ ! -f $corpus/wav.scp ] || [ ! -f $corpus/trans.txt ]; then
  echo "Error: $0 requires wav.scp and trans.txt under $corpus directory."
  exit 1;
fi

# validate utt-key list
awk '{print "AISHELL2_"$1}' $corpus/wav.scp > $tmp/wav_utt.list
awk '{print "AISHELL2_"$1}' $corpus/trans.txt > $tmp/trans_utt.list
tools/filter_scp.pl -f 1 $tmp/wav_utt.list $tmp/trans_utt.list > $tmp/utt.list

# wav.scp
awk -F'\t' -v path_prefix=$corpus '{printf("AISHELL2_%s %s/%s\n",$1,path_prefix,$2)}' $corpus/wav.scp > $tmp/tmp_wav.scp
tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tmp/wav.scp

# trans.txt
awk -F'\t' '{printf("AISHELL2_%s %s\n",$1,$2)}' $corpus/trans.txt > $tmp/tmp_trans.txt
tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_trans.txt | sort -k 1 | uniq > $tmp/trans.txt

# text: upper-case, map full-width Ａ/Ｔ/Ｍ to half-width, drop/replace rare glyphs
dos2unix < $tmp/trans.txt | \
  tools/filter_scp.pl -f 1 $tmp/utt.list - | \
  sort -k 1 | uniq | tr '[a-z]' '[A-Z]' | \
  sed 's/Ａ/A/g' | sed 's/Ｔ/T/g' | sed 's/Ｍ/M/g' | sed 's/𫚉//g' | sed 's/𫖯/頫/g' \
  > $tmp/text

# utt2spk & spk2utt
awk -F' ' '{print $2}' $tmp/wav.scp > $tmp/wav.list
sed -e 's:\.wav::g' $tmp/wav.list | \
  awk -F'/' '{i=NF-1;printf("AISHELL2_%s AISHELL2_%s\n",$NF,$i)}' > $tmp/tmp_utt2spk
tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_utt2spk | sort -k 1 | uniq > $tmp/utt2spk
tools/utt2spk_to_spk2utt.pl $tmp/utt2spk | sort -k 1 | uniq > $tmp/spk2utt

# copy prepared resources from tmp dir to target dir
for f in wav.scp text spk2utt utt2spk; do
  cp $tmp/$f $dir/$f || exit 1;
done

tools/validate_data_dir.sh --no-feats $dir || exit 1;
echo "local/prepare_data.sh succeeded"
exit 0;
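# tools/filter_scp.pl, used throughout these prep scripts, keeps the lines of
# its second input whose field selected by -f appears in the id list given as
# the first input. A toy demo with throwaway files:
#
#   printf 'utt1 a.wav\nutt2 b.wav\n' > /tmp/all.scp
#   printf 'utt1\n' > /tmp/keep.list
#   tools/filter_scp.pl -f 1 /tmp/keep.list /tmp/all.scp   # -> "utt1 a.wav"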
#!/bin/bash
# Copyright 2017 Xingyu Na
# Apache 2.0
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-path> <data-path>"
  echo " $0 /export/a05/xna/data/data_aishell data/aishell"
  exit 1;
fi

aishell_audio_dir=$1/wav
aishell_text=$1/transcript/aishell_transcript_v0.8.txt
data=$2
train_dir=$data/local/train
dev_dir=$data/local/dev
test_dir=$data/local/test
tmp_dir=$data/local/tmp

mkdir -p $train_dir
mkdir -p $dev_dir
mkdir -p $test_dir
mkdir -p $tmp_dir

# data directory check
if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
  echo "Error: $0 requires $aishell_audio_dir and $aishell_text to exist"
  exit 1;
fi

echo "**** Creating aishell data folder ****"

# find wav audio files for train, dev and test resp.
find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
n=$(wc -l < $tmp_dir/wav.flist)
[ $n -ne 141925 ] && \
  echo "Warning: expected 141925 data files, found $n"

grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
rm -r $tmp_dir

# Transcription preparation
for dir in $train_dir $dev_dir $test_dir; do
  echo Preparing $dir transcriptions
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all
  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
  # keep transcripts with audio; map full-width ａ/ｂ/ｃ/ｋ/ｔ to half-width
  tools/filter_scp.pl -f 1 $dir/utt.list $aishell_text | \
    sed 's/ａ/a/g' | sed 's/ｂ/b/g' |\
    sed 's/ｃ/c/g' | sed 's/ｋ/k/g' |\
    sed 's/ｔ/t/g' > $dir/transcripts.txt
  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
  tools/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u | awk '{print $1" BAC009"$2}' > $dir/utt2spk
  tools/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
  sort -u $dir/transcripts.txt > $dir/text
  tools/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
done

mkdir -p $data/train $data/dev $data/test
for f in spk2utt utt2spk wav.scp text; do
  cp $train_dir/$f $data/train/$f || exit 1;
  cp $dev_dir/$f $data/dev/$f || exit 1;
  cp $test_dir/$f $data/test/$f || exit 1;
done

# utils/data/validate_data_dir.sh --no-feats $data/train || exit 1;
# utils/data/validate_data_dir.sh --no-feats $data/dev || exit 1;
# utils/data/validate_data_dir.sh --no-feats $data/test || exit 1;

echo "$0: AISHELL data preparation succeeded"
exit 0;
#!/bin/bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# 2017 Xingyu Na
# Apache 2.0
remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
  exit 1;
fi

data=$1
url=$2
part=$3

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data, creating it."
  mkdir -p $data
fi

part_ok=false
list="data_aishell resource_aishell"
for x in $list; do
  if [ "$part" == $x ]; then part_ok=true; fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1;
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1;
fi

if [ -f $data/$part/.complete ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0;
fi

# sizes of the archive files in bytes
sizes="15582913665 1246920"

if [ -f $data/$part.tgz ]; then
  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
  size_ok=false
  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
  if ! $size_ok; then
    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm $data/$part.tgz
  else
    echo "$data/$part.tgz exists and appears to be complete."
  fi
fi

if [ ! -f $data/$part.tgz ]; then
  if ! which wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1;
  fi
  full_url=$url/$part.tgz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."
  cd $data
  if ! wget --no-check-certificate $full_url; then
    echo "$0: error executing wget $full_url"
    exit 1;
  fi
fi

cd $data
if ! tar -xvzf $part.tgz; then
  echo "$0: error un-tarring archive $data/$part.tgz"
  exit 1;
fi
touch $data/$part/.complete

if [ $part == "data_aishell" ]; then
  cd $data/$part/wav
  for wav in ./*.tar.gz; do
    echo "Extracting wav from $wav"
    tar -zxf $wav && rm $wav
  done
fi

echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"

if $remove_archive; then
  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
  rm $data/$part.tgz
fi
exit 0;
16_4013_20170819121429.wav
18_1565_20170712000170.wav
#!/bin/bash
# Copyright 2019 Xingyu Na
# Apache 2.0
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-path> <data-path>"
  echo " $0 /export/a05/xna/data/magicdata data/magicdata"
  exit 1;
fi

corpus=$1
data=$2

if [ ! -d $corpus/train ] || [ ! -d $corpus/dev ] || [ ! -d $corpus/test ]; then
  echo "Error: $0 requires complete corpus"
  exit 1;
fi

echo "**** Creating magicdata data folder ****"
mkdir -p $data/{train,dev,test,tmp}

# find wav audio files for train, dev and test resp.
tmp_dir=$data/tmp
find $corpus -iname "*.wav" > $tmp_dir/wav.flist
n=$(wc -l < $tmp_dir/wav.flist)
[ $n -ne 609552 ] && \
  echo "Warning: expected 609552 data files, found $n"

for x in train dev test; do
  grep -i "/$x/" $tmp_dir/wav.flist > $data/$x/wav.flist || exit 1;
  echo "Filtering data using found wav list and provided transcript for $x"
  # drop utterances on the bad list (gensub requires GNU awk)
  awk -F '.wav' '{print $1}' local/magicdata_badlist | tools/filter_scp.pl --exclude -f 1 - \
    <(cat $data/$x/wav.flist | awk -F '/' '{print gensub(".wav", "", "g", $NF), $0}') \
    > $data/$x/wav.scp
  sed '1d' $corpus/$x/TRANS.txt | awk -F '\t' '{print gensub(".wav","","g",$1), $2}' > $data/$x/utt2spk
  # strip half- and full-width punctuation (including the full-width space U+3000)
  sed '1d' $corpus/$x/TRANS.txt | awk -F '\t' '{print gensub(".wav","","g",$1), $3}' |\
    sed 's/!//g' | sed 's/?//g' |\
    sed 's/,//g' | sed 's/-//g' |\
    sed 's/://g' | sed 's/;//g' |\
    sed 's/　//g' | sed 's/。//g' |\
    sed 's/`//g' | sed 's/，//g' |\
    sed 's/：//g' | sed 's/？//g' |\
    sed 's/\///g' | sed 's/·//g' |\
    sed 's/\"//g' | sed 's/“//g' |\
    sed 's/”//g' | sed 's/\\//g' |\
    sed 's/…//g' | sed "s/‘//g" |\
    sed 's/、//g' | sed "s/’//g" | sed 's/《//g' | sed 's/》//g' |\
    sed 's/\[//g' | sed 's/\]//g' | sed 's/FIL//g' | sed 's/SPK//' |\
    tr '[a-z]' '[A-Z]' |\
    awk '{if (NF > 1) print $0;}' > $data/$x/text
  for file in wav.scp utt2spk text; do
    sort $data/$x/$file -o $data/$x/$file
  done
  tools/utt2spk_to_spk2utt.pl $data/$x/utt2spk > $data/$x/spk2utt
done

# rm -r $tmp_dir
tools/fix_data_dir.sh $data/train || exit 1;
tools/fix_data_dir.sh $data/dev || exit 1;
tools/fix_data_dir.sh $data/test || exit 1;
#!/bin/bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# 2019 Xingyu Na
# Apache 2.0
remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/68 train_set"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: train_set, dev_set, test_set."
  exit 1;
fi

data=$1
url=$2
part=$3
part1=$(echo $part | sed s/_set//)

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data, creating it."
  mkdir -p $data
fi

part_ok=false
list="train_set dev_set test_set"
for x in $list; do
  if [ "$part" == $x ]; then part_ok=true; fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1;
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1;
fi

if [ -f $data/$part1/.complete ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0;
fi

# sizes of the archive files in bytes
sizes="52627842921 1035537823 2201936013"

if [ -f $data/$part.tar.gz ]; then
  size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
  size_ok=false
  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
  if ! $size_ok; then
    echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm $data/$part.tar.gz
  else
    echo "$data/$part.tar.gz exists and appears to be complete."
  fi
fi

if [ ! -f $data/$part.tar.gz ]; then
  if ! which wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1;
  fi
  full_url=$url/$part.tar.gz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."
  cd $data
  if ! wget --no-check-certificate $full_url; then
    echo "$0: error executing wget $full_url"
    exit 1;
  fi
fi

cd $data
if ! tar -xvzf $part.tar.gz; then
  echo "$0: error un-tarring archive $data/$part.tar.gz"
  exit 1;
fi
touch $data/$part1/.complete

echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"

if $remove_archive; then
  echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
  rm $data/$part.tar.gz
fi
exit 0;
#!/bin/bash
# Copyright 2019 Xingyu Na
# Apache 2.0
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-path> <data-path>"
  echo " $0 /export/a05/xna/data/primewords data/primewords"
  exit 1;
fi

corpus=$1/primewords_md_2018_set1
data=$2

if [ ! -d $corpus/audio_files ] || [ ! -f $corpus/set1_transcript.json ]; then
  echo "Error: $0 requires complete corpus"
  exit 1;
fi

echo "**** Creating primewords data folder ****"
mkdir -p $data/train

# find wav audio files for train
find $corpus -iname "*.wav" > $data/wav.flist
n=$(wc -l < $data/wav.flist)
[ $n -ne 50384 ] && \
  echo "Warning: expected 50384 data files, found $n"

echo "Filtering data using found wav list and provided transcript"
local/primewords_parse_transcript.py $data/wav.flist $corpus/set1_transcript.json $data/train
cat $data/train/transcripts.txt |\
  awk '{if (NF > 1) print $0;}' > $data/train/text

for file in wav.scp utt2spk text; do
  sort $data/train/$file -o $data/train/$file
done
tools/utt2spk_to_spk2utt.pl $data/train/utt2spk > $data/train/spk2utt

# rm $data/wav.flist
tools/validate_data_dir.sh --no-feats $data/train || exit 1;
#!/bin/bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# 2017 Xingyu Na
# Apache 2.0
remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 2 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/47"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  exit 1;
fi

data=$1
url=$2
part=primewords_md_2018_set1

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data, creating it."
  mkdir -p $data
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1;
fi

if [ -f $data/.complete ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0;
fi

# size of the archive file in bytes
sizes="9057625192"

if [ -f $data/$part.tar.gz ]; then
  size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
  size_ok=false
  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
  if ! $size_ok; then
    echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm $data/$part.tar.gz
  else
    echo "$data/$part.tar.gz exists and appears to be complete."
  fi
fi

if [ ! -f $data/$part.tar.gz ]; then
  if ! which wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1;
  fi
  full_url=$url/$part.tar.gz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."
  cd $data
  if ! wget --no-check-certificate $full_url; then
    echo "$0: error executing wget $full_url"
    exit 1;
  fi
fi

cd $data
if ! tar -xvzf $part.tar.gz; then
  echo "$0: error un-tarring archive $data/$part.tar.gz"
  exit 1;
fi
touch $data/.complete

echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"

if $remove_archive; then
  echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
  rm $data/$part.tar.gz
fi
exit 0;
#!/usr/bin/env python3
import os
import sys
import json
def main(argv):
    # argv: [wav_flist, transcript_json, output_dir]
    fp = open(argv[1], encoding="utf-8")
    js = json.load(fp)
    fp.close()

    # index the transcript entries by wav file name
    metas = {}
    for ele in js:
        fname = ele['file']
        metas[fname] = ele

    fWavScp = open(os.path.join(argv[2], 'wav.scp'), 'w')
    fText = open(os.path.join(
        argv[2], 'transcripts.txt'), 'w', encoding="utf-8")
    fUtt2Spk = open(os.path.join(argv[2], 'utt2spk'), 'w')

    for line in open(argv[0]):
        fpath = line.strip('\r\n')
        wname = os.path.basename(fpath)
        meta = metas[wname]
        spkid = 'P' + meta['user_id']
        uttid = spkid + '-' + meta['id']
        fWavScp.write(uttid + ' ' + fpath + '\n')
        fText.write(uttid + ' ' + meta['text'] + '\n')
        fUtt2Spk.write(uttid + ' ' + spkid + '\n')

    fWavScp.close()
    fText.close()
    fUtt2Spk.close()


if __name__ == "__main__":
    main(sys.argv[1:])
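# For illustration, each entry of set1_transcript.json is expected to carry the
# keys read above; a hypothetical entry and the lines it would produce:
#
#   {"file": "a1b2c3.wav", "user_id": "092", "id": "a1b2c3", "text": "..."}
#
#   wav.scp : P092-a1b2c3 /path/to/audio_files/.../a1b2c3.wav
#   utt2spk : P092-a1b2c3 P092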
#!/bin/bash
# Copyright 2019 Xingyu Na
# Apache 2.0
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-path> <data-path>"
  echo " $0 /export/a05/xna/data/stcmds data/stcmds"
  exit 1;
fi

corpus=$1/ST-CMDS-20170001_1-OS
data=$2

if [ ! -d $corpus ]; then
  echo "Error: $0 requires complete corpus"
  exit 1;
fi

echo "**** Creating ST-CMDS data folder ****"
mkdir -p $data/train

# find wav audio files for train
find $corpus -iname "*.wav" > $data/wav.list
n=$(wc -l < $data/wav.list)
[ $n -ne 102600 ] && \
  echo "Warning: expected 102600 data files, found $n"

cat $data/wav.list | awk -F'20170001' '{print $NF}' | awk -F'.' '{print $1}' > $data/utt.list
cat $data/utt.list | awk '{print substr($1,1,6)}' > $data/spk.list
# each wav has a sibling .txt file holding its transcript
while read line; do
  tn=$(dirname $line)/$(basename $line .wav).txt
  cat $tn; echo;
done < $data/wav.list > $data/text.list

paste -d' ' $data/utt.list $data/wav.list > $data/train/wav.scp
paste -d' ' $data/utt.list $data/spk.list > $data/train/utt2spk
# strip full-width commas and upper-case any Latin characters
paste -d' ' $data/utt.list $data/text.list |\
  sed 's/，//g' |\
  tr '[a-z]' '[A-Z]' |\
  awk '{if (NF > 1) print $0;}' > $data/train/text

for file in wav.scp utt2spk text; do
  sort $data/train/$file -o $data/train/$file
done
tools/utt2spk_to_spk2utt.pl $data/train/utt2spk > $data/train/spk2utt

# rm $data/{wav,utt,spk,text}.list
tools/validate_data_dir.sh --no-feats $data/train || exit 1;
#!/bin/bash
# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
# 2017 Xingyu Na
# Apache 2.0
remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 2 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/38"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  exit 1;
fi

data=$1
url=$2
part=ST-CMDS-20170001_1-OS

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data, creating it."
  mkdir -p $data
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1;
fi

if [ -f $data/.complete ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0;
fi

# size of the archive file in bytes
sizes="8231662593"

if [ -f $data/$part.tar.gz ]; then
  size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
  size_ok=false
  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
  if ! $size_ok; then
    echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm $data/$part.tar.gz
  else
    echo "$data/$part.tar.gz exists and appears to be complete."
  fi
fi

if [ ! -f $data/$part.tar.gz ]; then
  if ! which wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1;
  fi
  full_url=$url/$part.tar.gz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."
  cd $data
  if ! wget --no-check-certificate $full_url; then
    echo "$0: error executing wget $full_url"
    exit 1;
  fi
fi

cd $data
if ! tar -xvzf $part.tar.gz; then
  echo "$0: error un-tarring archive $data/$part.tar.gz"
  exit 1;
fi
touch $data/.complete

echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"

if $remove_archive; then
  echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
  rm $data/$part.tar.gz
fi
exit 0;
#!/bin/bash
# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan)
# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu)
# Apache 2.0
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-path> <data-path>"
  echo " $0 /export/a05/xna/data/aisolution_data data/tal_asr"
  exit 1;
fi

tal_audio_dir=$1/wav/
tal_text=$1/transcript/transcript.txt
data=$2
train_dir=$data/local/train
dev_dir=$data/local/dev
test_dir=$data/local/test
tmp_dir=$data/local/tmp

mkdir -p $train_dir
mkdir -p $dev_dir
mkdir -p $test_dir
mkdir -p $tmp_dir

# data directory check
if [ ! -d $tal_audio_dir ] || [ ! -f $tal_text ]; then
  echo "Error: $0 requires $tal_audio_dir and $tal_text to exist"
  exit 1;
fi

echo "**** Creating TAL-ASR data folder ****"

# find wav audio files for train, dev and test resp.
find $tal_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
n=$(wc -l < $tmp_dir/wav.flist)
[ $n -ne 31747 ] && \
  echo "Warning: expected 31747 data files, found $n"

grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
rm -r $tmp_dir

# Transcription preparation
for dir in $train_dir $dev_dir $test_dir; do
  echo Preparing $dir transcriptions
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF, "TALASR"$(NF-1)"-"$NF}' > $dir/utt_uttid
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print "TALASR"$(NF-1)"-"$NF, "TALASR"$(NF-1)}' > $dir/utt2spk
  paste -d ' ' <(awk '{print $2}' $dir/utt_uttid) $dir/wav.flist > $dir/wav.scp
  # map full-width Ａ to half-width; strip punctuation and trailing spaces
  tools/filter_scp.pl -f 1 $dir/utt.list $tal_text | \
    sed 's/Ａ/A/g' | sed 's/#//g' | sed 's/=//g' | sed 's/、//g' | \
    sed 's/，//g' | sed 's/？//g' | sed 's/。//g' | sed 's/[ ][ ]*$//g' \
    > $dir/transcripts.txt
  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
  paste -d " " <(sort -u -k 1 $dir/utt_uttid | awk '{print $2}') \
    <(sort -u -k 1 $dir/transcripts.txt | awk '{for(i=2;i<NF;i++) {printf($i" ")}printf($NF"\n") }') \
    > $dir/text
  tools/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
done

mkdir -p $data/train $data/dev $data/test
for f in spk2utt utt2spk wav.scp text; do
  cp $train_dir/$f $data/train/$f || exit 1;
  cp $dev_dir/$f $data/dev/$f || exit 1;
  cp $test_dir/$f $data/test/$f || exit 1;
done

tools/fix_data_dir.sh $data/train || exit 1;
tools/fix_data_dir.sh $data/dev || exit 1;
tools/fix_data_dir.sh $data/test || exit 1;

echo "$0: TAL-ASR data preparation succeeded"
exit 0;
#!/bin/bash
# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan)
# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu)
# Apache 2.0
. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <corpus-path> <data-path>"
  echo " $0 /export/a05/xna/data/TAL_CSASR data/tal_mix"
  exit 1;
fi

tal_mix_audio_dir=$1/cs_wav
tal_mix_text=$1/label
data=$2
train_dir=$data/local/train
tmp_dir=$data/local/tmp

mkdir -p $train_dir
mkdir -p $tmp_dir

# data directory check
if [ ! -d $tal_mix_audio_dir ] || [ ! -f $tal_mix_text ]; then
  echo "Error: $0 requires $tal_mix_audio_dir and $tal_mix_text to exist"
  exit 1;
fi

echo "**** Creating TAL mix data folder ****"

# find wav audio files for train
find $tal_mix_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
n=$(wc -l < $tmp_dir/wav.flist)
[ $n -ne 370000 ] && \
  echo "Warning: expected 370000 data files, found $n"
# rm -r $tmp_dir

# Transcription preparation
echo Preparing transcriptions
sed -e 's/\.wav//' $tmp_dir/wav.flist | awk -F '/' '{print $NF}' > $train_dir/utt.list
sed -e 's/\.wav//' $tmp_dir/wav.flist | awk -F '/' '{printf("%s %s\n",$NF,$NF)}' > $train_dir/utt2spk
paste -d' ' $train_dir/utt.list $tmp_dir/wav.flist > $train_dir/wav.scp
cat $tal_mix_text | grep -Ev '^\s*$' | awk '{if(NF>1) print $0}' > $train_dir/transcript.txt
# cp $tal_mix_text $train_dir
wc -l $train_dir/transcript.txt

echo filtering
# map full-width characters to half-width and strip punctuation
tools/filter_scp.pl -f 1 $train_dir/utt.list $train_dir/transcript.txt | \
  sed 's/Ａ/A/g' | sed 's/Ｃ/C/g' | sed 's/Ｄ/D/g' | sed 's/Ｇ/G/g' | \
  sed 's/Ｈ/H/g' | sed 's/Ｕ/U/g' | sed 's/Ｙ/Y/g' | sed 's/ａ/a/g' | \
  sed 's/Ｉ/I/g' | sed 's/#//g' | sed 's/=//g' | sed 's/;//g' | \
  sed 's/，//g' | sed 's/？//g' | sed 's/。//g' | sed 's/\///g' | \
  sed 's/！//g' | sed 's/!//g' | sed 's/\.//g' | sed 's/\?//g' | \
  sed 's/：//g' | sed 's/,//g' | sed 's/\"//g' | sed 's/://g' | \
  sed 's/@//g' | sed 's/-/ /g' | sed 's/、/ /g' | sed 's/~/ /g' | \
  sed "s/‘/\'/g" | sed 's/Ｅ/E/g' | sed "s/’/\'/g" | sed 's/《//g' | sed 's/》//g' | \
  sed "s/[ ][ ]*$//g" | sed "s/\[//g" | sed 's/、//g' > $train_dir/text

tools/utt2spk_to_spk2utt.pl $train_dir/utt2spk > $train_dir/spk2utt

mkdir -p $data/train
for f in spk2utt utt2spk wav.scp text; do
  cp $train_dir/$f $data/train/$f || exit 1;
done

tools/fix_data_dir.sh $data/train || exit 1;
echo "$0: TAL mix data preparation succeeded"
exit 0;