Commit 764b3a75 authored by Sugon_ldc

add new model
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO
stage=0 # start from -1 if you need to start from data download
stop_stage=2
# The number of nodes or machines used for multi-machine training.
# Default is 1 for a single machine/node.
# NFS is needed if you want to run multi-machine training.
num_nodes=1
# The rank of each node or machine, ranging from 0 to num_nodes - 1.
# The first node/machine sets node_rank 0, the second sets node_rank 1,
# the third sets node_rank 2, and so on. Default is 0.
node_rank=0
# data
download_path=/root/autodl-tmp
french_data=/root/autodl-tmp/cv-corpus-8.0-2022-01-19
# path to save preprocessed data
# export data=data
. ./path.sh
. ./tools/parse_options.sh || exit 1
nj=16
# data_type can be `raw` or `shard`. Typically, `raw` is used for small datasets,
# while `shard` is used for large datasets (over 1k hours); `shard` is
# faster for data reading during training.
data_type=raw
num_utts_per_shard=1000
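# For reference, each line of a `raw` data.list is a JSON record written by
# tools/make_raw_list.py, roughly of the form (hypothetical key and paths):
#   {"key": "utt1", "wav": "/path/to/utt1.wav", "txt": "bonjour"}
# With `shard`, data.list instead lists tar archives that each pack
# num_utts_per_shard utterances.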
train_set=train
# Optional train_config
# 1. conf/train_transformer.yaml: Standard transformer
# 2. conf/train_conformer.yaml: Standard conformer
# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer
# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer
# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding
# 6. conf/train_u2++_conformer.yaml: U2++ conformer
# 7. conf/train_u2++_transformer.yaml: U2++ transformer
train_config=conf/train_conformer.yaml
cmvn=true
dir=exp/conformer
checkpoint=
nbpe=5000
# Using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=20
#decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring"
decode_modes="attention attention_rescoring"
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data download"
echo "download Dataset!"
local/download_data.sh ${download_path} ${french_data}
echo "Finish stage 0"
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "stage 0: Data preparation"
local/prepare_data.sh ${french_data}/fr
echo "Finish stage 0"
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: compute global cmvn"
# compute cmvn
python tools/compute_cmvn_stats.py --num_workers 1 --train_config $train_config \
--in_scp data/${train_set}/wav.scp \
--out_cmvn data/${train_set}/global_cmvn
echo "Finish stage 1"
fi
bpemode=unigram
dict=data/lang_char_/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=data/lang_char_/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p data/lang_char_/
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
# We borrowed the BPE-related code and scripts from ESPnet.
cut -f 2- -d" " data/${train_set}/text > data/lang_char_/input.txt
tools/spm_train --input=data/lang_char_/input.txt --vocab_size=${nbpe} \
--model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
tools/spm_encode --model=${bpemodel}.model --output_format=piece \
< data/lang_char_/input.txt | \
tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
wc -l ${dict}
fi
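# For reference, ${dict} ends up as a plain "token id" table, e.g. (the BPE
# pieces below are hypothetical; ids depend on your data):
#   <blank> 0
#   <unk> 1
#   ▁bonjour 2
#   ...
#   <sos/eos> 5001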
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Prepare data, prepare required format"
for x in dev test ${train_set}; do
if [ $data_type == "shard" ]; then
python tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
--num_threads 16 data/$x/wav.scp data/$x/text \
$(realpath data/$x/shards) data/$x/data.list
else
python tools/make_raw_list.py data/$x/wav.scp data/$x/text \
data/$x/data.list
fi
done
echo "Finish stage 3"
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
mkdir -p $dir
# You have to rm `INIT_FILE` manually when you resume or restart a
# multi-machine training.
INIT_FILE=$dir/ddp_init
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp data/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py rewrites $train_config as $dir/train.yaml with the model input
# and output dimensions filled in; $dir/train.yaml will then be used for
# inference and export.
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process, used for knowing whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
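# For example (hypothetical 2-node setup): with num_nodes=2 and 3 GPUs per node,
# world_size=6; node 0 launches ranks 0,1,2 and node 1 launches ranks 3,4,5
# (rank = node_rank * num_gpus + i).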
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
--bpe_model $bpemodel.model \
--train_data data/$train_set/data.list \
--cv_data data/dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 1 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test the model; please specify the model you want to test via --checkpoint
cmvn_opts=
$cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
# TODO, Add model average here
mkdir -p $dir/test
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
ctc_weight=0.5
# Poll GPU ids beginning with index 0
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
idx=0
for mode in ${decode_modes}; do
{
{
test_dir=$dir/test_${mode}
mkdir -p $test_dir
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type "raw" \
--bpe_model $bpemodel.model \
--test_data data/test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 20 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--result_file $test_dir/text_bpe \
--ctc_weight $ctc_weight \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp
cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp
tools/spm_decode --model=${bpemodel}.model --input_format=piece \
< $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value
#sed -e "s/▁/ /g" $test_dir/text_bpe_value_tmp > $test_dir/text_value
paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value > $test_dir/text
# a raw WER without any text refining process
python tools/compute-wer.py --char=1 --v=1 \
data/test/text $test_dir/text > $test_dir/wer
} &
((idx+=1))
if [ $idx -eq $num_gpus ]; then
idx=0
fi
}
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip
fi
../../../tools/
../../../wenet/
# Performance Record
## Conformer Bidecoder (Large) Result
## Conformer Result
* Feature info: using fbank feature, cmvn, dither, online speed perturb
* Training info: train_conformer.yaml, kernel size 15, lr 0.004, batch size 12, 8 gpu, acc_grad 1, 50 epochs, dither 0.0
* Decoding info: ctc_weight 0.5, average_num 10
| decoding mode | test1 | test2 | test3 |
|----------------------------------|------------|------------|------------|
| ctc greedy search | 7.94 | 5.29 | 6.10 |
| ctc prefix beam search | 7.83+ | 5.28 | 6.08 |
| attention decoder | 7.83 | 5.63 | 6.37 |
| attention rescoring | 7.28+ | 4.81 | 5.44 |
Note that "+" means we removed two <0.1s wav files from test1 before decoding.
## Conformer U2++ Result
## Conformer U2 Result
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# dataset related
dataset_conf:
    filter_conf:
        max_length: 2000
        min_length: 50
        token_max_length: 400
        token_min_length: 1
        min_output_input_ratio: 0.05
        max_output_input_ratio: 10.0
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 0.0
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 12
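        # If you switch batch_type to 'dynamic', batching is by total frames per
        # batch rather than a fixed utterance count; in wenet this is typically
        # configured with max_frames_in_batch instead of batch_size, e.g.
        # (illustrative value, adjust to your GPU memory):
        # max_frames_in_batch: 12000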
grad_clip: 5
accum_grad: 1
max_epoch: 50
log_interval: 100
optim: adam
optim_conf:
    lr: 0.004
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
# parse xml files and output simplified version
import xml.dom.minidom
import os
import sys
import multiprocessing
def parsexml(afile, outpath):
    outfile = os.path.join(outpath, afile.split('/')[-1] + '.simp')
    with open(outfile, 'w') as bw:
        domtree = xml.dom.minidom.parse(afile)
        collection = domtree.documentElement
        ipus = collection.getElementsByTagName('IPU')
        for ipu in ipus:
            starttime = 0
            endtime = 0
            if ipu.hasAttribute('IPUStartTime'):
                starttime = ipu.getAttribute('IPUStartTime')
            if ipu.hasAttribute('IPUEndTime'):
                endtime = ipu.getAttribute('IPUEndTime')
            # print('{}\t{}'.format(starttime, endtime))
            # ## original format ###
            wlist = list()
            plainwlist = list()
            pronlist = list()
            # ## pronunciation ###
            lemmalist = list()  # lemma list
            dictlemmalist = list()  # dict lemma list
            for suw in ipu.getElementsByTagName('SUW'):  # short unit word
                txt = ''
                plaintxt = ''
                # PhoneticTranscription
                prontxt = ''
                if suw.hasAttribute('OrthographicTranscription'):
                    txt = suw.getAttribute('OrthographicTranscription')
                if suw.hasAttribute('PlainOrthographicTranscription'):
                    plaintxt = suw.getAttribute('PlainOrthographicTranscription')
                if suw.hasAttribute('PhoneticTranscription'):
                    prontxt = suw.getAttribute('PhoneticTranscription')
                wlist.append(txt)
                plainwlist.append(plaintxt)
                pronlist.append(prontxt)
                lemma = ''
                dictlemma = ''
                if suw.hasAttribute('SUWLemma'):
                    lemma = suw.getAttribute('SUWLemma')
                if suw.hasAttribute('SUWDictionaryForm'):
                    dictlemma = suw.getAttribute('SUWDictionaryForm')
                lemmalist.append(lemma)
                dictlemmalist.append(dictlemma)
            txtsent = ' '.join(wlist)
            plaintxtsent = ' '.join(plainwlist)
            prontxtsent = ' '.join(pronlist)
            lemmasent = ' '.join(lemmalist)
            dictlemmasent = ' '.join(dictlemmalist)
            outrow = '{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                starttime, endtime, txtsent, plaintxtsent,
                prontxtsent, lemmasent, dictlemmasent)
            bw.write(outrow)
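# Each line written to the .simp file above is tab-separated:
#   start_time \t end_time \t ortho \t plain_ortho \t phonetic \t lemma \t dict_lemma
# with the fields taken directly from the IPU/SUW attributes read above.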
def procfolder_orig(apath, outpath):
    count = 0
    for afile in os.listdir(apath):
        if not afile.endswith('.xml'):
            continue
        afile = os.path.join(apath, afile)
        parsexml(afile, outpath)
        count += 1
        print('done: {} [{}]'.format(afile, count))
def procfolder(apath, outpath):
    # count = 0
    fnlist = list()
    for afile in os.listdir(apath):
        if not afile.endswith('.xml'):
            continue
        fnlist.append(afile)
    # now parallel processing:
    nthreads = 16
    for i in range(0, len(fnlist), nthreads):
        # fnlist[i, i+16]
        pool = multiprocessing.Pool(processes=nthreads)
        for j in range(nthreads):
            if i + j < len(fnlist):
                afile = os.path.join(apath, fnlist[i + j])
                pool.apply_async(parsexml, (afile, outpath))
        pool.close()
        pool.join()
    print('parallel {} threads done for {} files in total.'.format(
        nthreads, len(fnlist)))
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Usage: {} <in.csj.path> <out.csj.path>".format(sys.argv[0]))
        exit(1)
    # e.g., csjpath='/workspace/asr/csj/'
    csjpath = sys.argv[1]
    outcsjpath = sys.argv[2]
    apath = os.path.join(csjpath, 'XML/BaseXML/core')
    apath2 = os.path.join(csjpath, 'XML/BaseXML/noncore')
    outapath = os.path.join(outcsjpath, 'xml')
    # create the "outapath" dir:
    if not os.path.exists(outapath):
        os.mkdir(outapath)
    # range over the following two folders:
    procfolder(apath, outapath)
    procfolder(apath2, outapath)
# based on xml.simp -> start_time and end_time -> split using sox
import os
import sys
import multiprocessing
import librosa
import soundfile as sf
# use .simp as the source for .wav file splitting
def wavfn(apath):
    wavdict = dict()  # key=id, value=full.path of .wav
    for awavfn in os.listdir(apath):
        fullwavpath = os.path.join(apath, awavfn)
        aid = awavfn.replace('.wav', '')
        wavdict[aid] = fullwavpath
    return wavdict
def xmlfn(apath):
    xmldict = dict()  # key=id, value=full.path of .xml.simp
    for axmlfn in os.listdir(apath):
        if not axmlfn.endswith('.xml.simp'):
            continue
        axmlfn2 = os.path.join(apath, axmlfn)
        aid = axmlfn.replace('.xml.simp', '')
        # print('obtain id: {}\t{}'.format(axmlfn, aid))
        xmldict[aid] = axmlfn2
    return xmldict
def ch2to1(f1, outf1):
    wav1, _ = librosa.load(f1, sr=16000, mono=False)
    if wav1.ndim == 1:
        return
    wav1mono = librosa.to_mono(wav1)
    sf.write(outf1, wav1mono, 16000)
    # print('2ch to 1ch, {} -> {}'.format(f1, outf1))
    acmd = 'mv {} {}'.format(outf1, f1)
    res = os.system(acmd)
    # rename the .1ch file back to the .wav file and
    # overwrite the old .wav file which is 2ch
    # print(res, acmd)
def proc1file(fullxmlfn, fullwavfn, outwavpath):
    with open(fullxmlfn) as xmlbr:
        for axmlline in xmlbr.readlines():
            # start.time end.time ortho plainortho phonetic
            axmlline = axmlline.strip()
            cols = axmlline.split('\t')
            stime = cols[0]
            etime = cols[1]
            if len(cols) == 2:
                continue  # skip
            basename = fullwavfn.split('/')[-1]
            name2 = '{}_{}_{}.wav'.format(basename, stime, etime)
            partwavfn = os.path.join(outwavpath, name2)
            dur = float(etime) - float(stime)
            acmd = 'sox {} {} trim {} {}'.format(fullwavfn, partwavfn, stime, dur)
            res = os.system(acmd)
            # print(res, acmd)
            # perform 2ch to 1ch if necessary!
            partwavfn1ch = partwavfn + ".1ch.wav"  # NOTE: must end with '.wav',
            # otherwise soundfile.write will report an error!
            ch2to1(partwavfn, partwavfn1ch)
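# For a hypothetical recording S04F1228.wav with an IPU spanning 00458.875s to
# 00459.209s, the sox command built above looks roughly like:
#   sox .../WAV/core/S04F1228.wav data/wav/S04F1228.wav_00458.875_00459.209.wav trim 00458.875 0.334
# i.e. each segment keeps the original basename plus its start/end times.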
def procpath(atag, csjpath, xmlsimppath, outwavpath, idset):
    # atag = 'core' or 'noncore'
    axmlpath = xmlsimppath
    awavpath = os.path.join(csjpath, atag)
    xmldict = xmlfn(axmlpath)
    wavdict = wavfn(awavpath)
    wavidlist = list(wavdict.keys())
    # parallel processing
    nthreads = 16
    for i in range(0, len(wavidlist), nthreads):
        pool = multiprocessing.Pool(processes=nthreads)
        for j in range(nthreads):
            if i + j < len(wavidlist):
                wavid = wavidlist[i + j]
                if len(idset) > 0 and wavid not in idset:
                    # when idset is not empty, then only process the ids
                    # that are included in idset:
                    continue
                fullwavfn = wavdict[wavid]
                if wavid in xmldict:
                    fullxmlfn = xmldict[wavid]
                    pool.apply_async(proc1file, (fullxmlfn, fullwavfn, outwavpath))
        pool.close()
        pool.join()
    print('parallel {} threads done for {} files.'.format(
        nthreads,
        len(wavidlist)))
if __name__ == '__main__':
    if len(sys.argv) < 4:
        print(
            "Usage: {}".format(sys.argv[0]) +
            " <in.csj.path> <in.xml.simp.path> <out.wav.path> [id.list.fn]")
        exit(1)
    csjpath = sys.argv[1]
    xmlsimppath = sys.argv[2]
    outwavpath = sys.argv[3]
    idlistfn = sys.argv[4] if len(sys.argv) == 5 else ""
    idset = set()
    if len(idlistfn) > 0:
        with open(idlistfn) as br:
            for aline in br.readlines():
                aline = aline.strip()
                idset.add(aline)
        print(idset)
    for atag in ['core', 'noncore']:
        procpath(atag, csjpath, xmlsimppath, outwavpath, idset)
import os
import sys
# train test1 test2 test3
def readtst(tstfn):
    outlist = list()
    with open(tstfn) as br:
        for aline in br.readlines():
            aline = aline.strip()
            outlist.append(aline)
    return outlist
def split_train_tests_xml(xmlpath, test1fn, test2fn, test3fn):
    test1list = readtst(test1fn)
    test2list = readtst(test2fn)
    test3list = readtst(test3fn)
    outtrainlist = list()  # full path ".xml.simp" files
    outt1list = list()  # test 1, full path ".xml.simp" files
    outt2list = list()
    outt3list = list()
    for afile in os.listdir(xmlpath):
        if not afile.endswith('.xml.simp'):
            continue
        afile2 = xmlpath + '/' + afile
        aid = afile.split('.')[0]
        if aid in test1list:
            outt1list.append(afile2)
        elif aid in test2list:
            outt2list.append(afile2)
        elif aid in test3list:
            outt3list.append(afile2)
        else:
            outtrainlist.append(afile2)
    return outtrainlist, outt1list, outt2list, outt3list
def all_wavs(wavpath):
    wavlist = list()
    for afile in os.listdir(wavpath):
        if not afile.endswith('.wav'):
            continue
        afile2 = wavpath + '/' + afile
        wavlist.append(afile2)
    return wavlist
def gen_text(xmllist, outpath):
    # id \t text
    # e.g., /workspace/asr/wenet/examples/csj/s0/data/xml/S11M1689.xml.simp
    # ID = S11M1689_stime_etime
    outtxtfn = os.path.join(outpath, 'text')
    with open(outtxtfn, 'w') as bw:
        for xmlfn in xmllist:
            aid = xmlfn.split('/')[-1]
            aid2 = aid.split('.')[0]
            with open(xmlfn) as br:
                for aline in br.readlines():
                    aline = aline.strip()
                    # stime \t etime \t text1 \t text2 \t text3 \t text4 \t text5
                    cols = aline.split('\t')
                    # TODO: difference between "< 7" and "< 4"? strange
                    # -> use "< 4", DO NOT use "< 7" !
                    if len(cols) < 4:
                        continue
                    stime = cols[0]
                    etime = cols[1]
                    atxt = cols[3].replace(' ', '')
                    afullid = '{}_{}_{}'.format(aid2, stime, etime)
                    aoutline = '{}\t{}\n'.format(afullid, atxt)
                    bw.write(aoutline)
def parse_xml_set(xmllist):
    outset = set()
    for xml in xmllist:
        aid = xml.split('/')[-1]
        aid2 = aid.split('.')[0]
        outset.add(aid2)
    return outset
def gen_wav_scp(xmllist, wavlist, outpath):
    # xmlset = pure id set, like 'S04F1228'
    # can be from train, test1, test2, or test3
    xmlset = parse_xml_set(xmllist)
    outwavscpfn = os.path.join(outpath, 'wav.scp')
    with open(outwavscpfn, 'w') as bw:
        for wav in wavlist:
            # wav is like "/workspace/asr/wenet/examples/csj/s0/data
            # /wav/S04F1228.wav_00458.875_00459.209.wav"
            aid = wav.split('/')[-1]
            cols = aid.split('_')
            aid2 = cols[0].split('.')[0]
            if aid2 not in xmlset:
                continue
            stime = cols[1]
            etime = cols[2].replace('.wav', '')
            afullid = '{}_{}_{}'.format(aid2, stime, etime)
            wavabspath = os.path.abspath(wav)
            aoutline = '{}\t{}\n'.format(afullid, wavabspath)
            bw.write(aoutline)
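# The two files produced above follow Kaldi-style conventions, one utterance per
# line (hypothetical example):
#   text:    S04F1228_00458.875_00459.209 <tab> transcription
#   wav.scp: S04F1228_00458.875_00459.209 <tab> /abs/path/data/wav/S04F1228.wav_00458.875_00459.209.wav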
def prep_text_wavscp(
        xmlpath, wavpath, test1fn, test2fn, test3fn,
        outtrainpath, out1path, out2path, out3path):
    trainlist, t1list, t2list, t3list = split_train_tests_xml(
        xmlpath,
        test1fn,
        test2fn,
        test3fn)
    wavlist = all_wavs(wavpath)
    gen_text(trainlist, outtrainpath)
    gen_text(t1list, out1path)
    gen_text(t2list, out2path)
    gen_text(t3list, out3path)
    gen_wav_scp(trainlist, wavlist, outtrainpath)
    gen_wav_scp(t1list, wavlist, out1path)
    gen_wav_scp(t2list, wavlist, out2path)
    gen_wav_scp(t3list, wavlist, out3path)
if __name__ == '__main__':
    if len(sys.argv) < 10:
        print(
            "Usage: {}".format(sys.argv[0]) + " <xmlpath> " +
            "<wavpath> <test1fn> <test2fn> <test3fn> " +
            "<outtrainpath> <out1path> <out2path> <out3path>")
        exit(1)
    xmlpath = sys.argv[1]
    wavpath = sys.argv[2]
    test1fn = sys.argv[3]
    test2fn = sys.argv[4]
    test3fn = sys.argv[5]
    outtrainpath = sys.argv[6]
    out1path = sys.argv[7]
    out2path = sys.argv[8]
    out3path = sys.argv[9]
    prep_text_wavscp(xmlpath, wavpath, test1fn,
                     test2fn, test3fn, outtrainpath,
                     out1path, out2path, out3path)
import librosa
# import os
import sys
def mincut(wavscpfn, minsec):
    outfn = wavscpfn + "_" + str(minsec)
    with open(outfn, 'w') as bw:
        with open(wavscpfn) as br:
            for aline in br.readlines():
                aline = aline.strip()
                afn = aline.split('\t')[1]
                # print(afn)
                dur = librosa.get_duration(filename=afn)
                if dur >= minsec:
                    bw.write(aline + '\n')
# wn.3.mincut.py <wav.scp> <min.sec>
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print('{} <in.wav.scp> <min.sec.cut>'.format(sys.argv[0]))
        exit()
    wavscpfn = sys.argv[1]
    minsec = float(sys.argv[2])
    mincut(wavscpfn, minsec)
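# mincut() writes a filtered copy next to the input named "<wav.scp>_<minsec>"
# (e.g. wav.scp_0.1), keeping only utterances at least minsec seconds long;
# run.sh later uses this filtered list for cmvn computation and data.list creation.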
#!/usr/bin/env python3
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--segments', default=None, help='segments file')
    parser.add_argument('wav_file', help='wav file')
    parser.add_argument('text_file', help='text file')
    parser.add_argument('output_file', help='output list file')
    args = parser.parse_args()
    wav_table = {}
    with open(args.wav_file, 'r', encoding='utf8') as fin:
        for line in fin:
            arr = line.strip().split()
            assert len(arr) == 2
            wav_table[arr[0]] = arr[1]
    if args.segments is not None:
        segments_table = {}
        with open(args.segments, 'r', encoding='utf8') as fin:
            for line in fin:
                arr = line.strip().split()
                assert len(arr) == 4
                segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3]))
    with open(args.text_file, 'r', encoding='utf8') as fin, \
            open(args.output_file, 'w', encoding='utf8') as fout:
        for line in fin:
            arr = line.strip().split(maxsplit=1)
            key = arr[0]
            txt = arr[1] if len(arr) > 1 else ''
            if args.segments is None:
                # assert key in wav_table
                if key in wav_table:
                    wav = wav_table[key]
                    line = dict(key=key, wav=wav, txt=txt)
                else:
                    line = None
            else:
                # assert key in segments_table
                if key in segments_table:
                    wav_key, start, end = segments_table[key]
                    wav = wav_table[wav_key]
                    line = dict(key=key, wav=wav, txt=txt, start=start, end=end)
                else:
                    line = None
            if line:
                json_line = json.dumps(line, ensure_ascii=False)
                fout.write(json_line + '\n')
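# When --segments is given, each segments line is "<utt_id> <wav_id> <start> <end>"
# and the emitted JSON record also carries start/end, e.g. (hypothetical):
#   {"key": "utt1", "wav": "/path/a.wav", "txt": "hello", "start": 1.0, "end": 2.5}
# Without --segments, records contain only key/wav/txt as in the code above.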
D01F0002
D01F0003
D01F0023
D01F0030
D01F0046
D01F0049
D01F0055
D01F0057
D01M0005
D01M0009
D01M0012
D01M0019
D01M0020
D01M0042
D01M0043
D01M0047
D02F0015
D02F0018
D02F0025
D02F0027
D02F0031
D02F0032
D02F0033
D02F0054
D02M0014
D02M0016
D02M0024
D02M0026
D02M0028
D02M0035
D02M0039
D02M0051
D03F0001
D03F0006
D03F0008
D03F0034
D03F0036
D03F0040
D03F0045
D03F0058
D03M0004
D03M0007
D03M0013
D03M0017
D03M0037
D03M0038
D03M0048
D03M0053
D04F0011
D04F0022
D04F0029
D04F0044
D04F0050
D04M0010
D04M0021
D04M0041
D04M0052
D04M0056
A01M0097
A04M0051
A04M0121
A03M0156
A03M0112
A01M0110
A05M0011
A03M0106
A01M0137
A04M0123
A01M0097
A04M0051
A04M0121
A03M0156
A03M0112
A01M0110
A05M0011
A03M0106
A01M0137
A04M0123
A01F0063
A01M0056
A06F0135
A02M0012
A06M0064
A01M0141
A01F0034
A03M0016
A03F0072
A01F0001
S00F0066
S00M0213
S00M0070
S00M0008
S01F0105
S00F0148
S00F0019
S00M0112
S00F0152
S00M0079
A01F0063
A01M0056
A06F0135
A02M0012
A06M0064
A01M0141
A01F0034
A03M0016
A03F0072
A01F0001
S00F0066
S00M0213
S00M0070
S00M0008
S01F0105
S00F0148
S00F0019
S00M0112
S00F0152
S00M0079
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# 1. xml split by sentences
# 2. wav split by xml.simp's guidance
# 3. generate "text" and "wav.scp" files as required by wenet
# 4. compute cmvn; wav length should be >= 0.1s, otherwise errors happen...
# 5. SentencePiece BPE vocabulary
# 6. make "data.list" files
# 7. train -> 50 epochs
stage=1 # start from this stage (1 = xml preprocessing)
stop_stage=8
# data
#data_url=www.openslr.org/resources/12
# TODO use your own data path
datadir=/workspace/asr/csj
# output wav data dir
wave_data=data # wave file path
# Optional train_config
train_config=conf/train_conformer.yaml
checkpoint=
cmvn=true # cmvn is for mean, variance, frame_number statistics
do_delta=false # not used...
dir=exp/sp_spec_aug # model's dir (output dir)
# Using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
# You may need to adjust this if you cannot get results close to those in README.md
average_num=10
decode_modes="attention_rescoring ctc_greedy_search ctc_prefix_beam_search attention"
. tools/parse_options.sh || exit 1;
# bpemode (unigram or bpe)
nbpe=4096 # TODO -> you can change this value to 5000, 100000 and so on
bpemode=bpe #unigram # TODO -> you can use unigram and other methods
set -e # exit immediately if any command exits with a non-zero status
set -u # treat unset variables as an error
set -o pipefail # a pipeline's exit status is that of the last command to fail
train_set=train
dev_set=dev
recog_set="test1 test2 test3"
### CSJ data is not free!
# Purchase URL: https://ccd.ninjal.ac.jp/csj/en/
### data preparing - split xml by sentences ###
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
### I did not follow the ESPnet or Kaldi pre-processing;
### I developed my own approach, so use it at your own risk.
echo "stage 1: Data preparation -> xml preprocessing "
echo " -> extract [start.time, end.time, text] from raw xml files"
python ./csj_tools/wn.0.parse.py $datadir ${wave_data}
fi
in_wav_path=$datadir/WAV
xml_simp_path=${wave_data}/xml
#wav_split_path=${wave_data}/wav.2
wav_split_path=${wave_data}/wav
mkdir -p ${wav_split_path}
### data preparing - split wav by xml.simp's guidance ###
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: Data preparation -> wav preprocessing "
echo " -> split wav file by xml.simp's [start.time, end.time, text] format"
# in addition, 2ch to 1ch!
python ./csj_tools/wn.1.split_wav.py ${in_wav_path} ${xml_simp_path} ${wav_split_path}
fi
### data preparing - generate "text" and "wav.scp" files ###
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: prepare text and wav.scp for train/test1/test2/test3 from wav and xml folders"
t1fn='list_files/test.set.1.list'
t2fn='list_files/test.set.2.list'
t3fn='list_files/test.set.3.list'
outtrain=${wave_data}/train
outt1=${wave_data}/test1
outt2=${wave_data}/test2
outt3=${wave_data}/test3
mkdir -p $outtrain
mkdir -p $outt1
mkdir -p $outt2
mkdir -p $outt3
python ./csj_tools/wn.2.prep.text.py \
${xml_simp_path} ${wav_split_path} \
$t1fn $t2fn $t3fn \
$outtrain $outt1 $outt2 $outt3
fi
minsec=0.1
### compute static info: mean, variance, frame_num ###
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "stage 4: Feature Generation"
# TODO: if this fails, make sure your wav files are all >= 0.1s ...
mkdir -p $wave_data/dev
# merge total dev data
for set in test1 test2 test3; do
for f in `ls $wave_data/$set`; do
cat $wave_data/$set/$f >> $wave_data/$dev_set/$f
done
done
python ./csj_tools/wn.3.mincut.py $wave_data/$train_set/wav.scp $minsec
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp $wave_data/$train_set/wav.scp_$minsec \
--out_cmvn $wave_data/$train_set/global_cmvn
fi
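# The resulting global_cmvn is a small JSON file written by
# tools/compute_cmvn_stats.py holding the accumulated per-dimension mean and
# variance statistics plus the total frame count; training reads it through
# the --cmvn option (see cmvn_opts in the training stage below).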
### use sentence piece to construct subword vocabulary ###
dict=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 5: Dictionary and Json Data Preparation"
mkdir -p data/lang_char/
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
# We borrowed the BPE-related code and scripts from ESPnet.
cut -f 2- -d" " $wave_data/${train_set}/text > $wave_data/lang_char/input.txt
tools/spm_train \
--input=$wave_data/lang_char/input.txt \
--vocab_size=${nbpe} \
--model_type=${bpemode} \
--model_prefix=${bpemodel} \
--input_sentence_size=100000000
tools/spm_encode \
--model=${bpemodel}.model \
--output_format=piece < $wave_data/lang_char/input.txt | \
tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
wc -l ${dict}
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Prepare wenet required data
echo "Prepare data, prepare required format"
for x in $train_set ; do
python csj_tools/wn.4.make_raw_list.py $wave_data/$x/wav.scp_$minsec $wave_data/$x/text \
$wave_data/$x/data.list
done
for x in $dev_set ${recog_set} ; do
python csj_tools/wn.4.make_raw_list.py $wave_data/$x/wav.scp $wave_data/$x/text \
$wave_data/$x/data.list
done
fi
### Training! ###
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
cmvn_opts=
$cmvn && cmvn_opts="--cmvn $wave_data/${train_set}/global_cmvn"
# train.py rewrites $train_config as $dir/train.yaml with the model input
# and output dimensions filled in; train.yaml will be used later for inference
# and model export.
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type raw \
--symbol_table $dict \
--train_data $wave_data/$train_set/data.list \
--cv_data $wave_data/$dev_set/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $num_gpus \
--ddp.rank $i \
--ddp.dist_backend $dist_backend \
--num_workers 1 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
### test model ###
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
# Test the model; please specify the model you want to test via --checkpoint
cmvn_opts=
$cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
mkdir -p $dir/test
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=-1
ctc_weight=0.5
# Polling GPU id begin with index 0
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
idx=0
for test in $recog_set; do
for mode in ${decode_modes}; do
{
{
test_dir=$dir/${test}_${mode}
mkdir -p $test_dir
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type raw \
--test_data $wave_data/$test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--result_file $test_dir/text_bpe \
--ctc_weight $ctc_weight \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp
cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp
tools/spm_decode --model=${bpemodel}.model --input_format=piece \
< $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value_tmp
paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value_tmp > $test_dir/text
python tools/compute-wer.py --char=1 --v=1 \
$wave_data/$test/text $test_dir/text > $test_dir/wer
} &
((idx+=1))
if [ $idx -eq $num_gpus ]; then
idx=0
fi
}
done
done
wait
fi
../../../tools
../../../wenet
# GigaSpeech
A large, modern, and evolving dataset for automatic speech recognition. More details about GigaSpeech can be found at https://github.com/SpeechColab/GigaSpeech.
# Performance Record
## Conformer bidecoder Result
* Feature info: using fbank feature, dither 1.0, cmvn, 16k
* Training info: conf/train_conformer_bidecoder.yaml, subsample 4, kernel size 31, lr 0.001, batch size 24, 8 gpu, acc_grad 4, 40 epochs
* Decoding info: ctc_weight 0.3, reverse_weight 0.5, average_num 10
* Git hash: 9a0c270f9f976d7e887f777690e6c358a45a1c27
### test set gigaspeech scoring
| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err |
|-----------|-------|--------|------|-----|-----|-----|------|-------|
| Sum/Avg | 19928 | 390656 | 91.4 | 6.4 | 2.2 | 2.0 | 10.6 | 63.1 |
| Mean | 152.1 | 2982.1 | 91.4 | 6.3 | 2.3 | 1.7 | 10.3 | 63.7 |
| S.D. | 142.2 | 2838.1 | 5.5 | 4.1 | 1.6 | 1.3 | 6.4 | 16.9 |
| Median | 108.0 | 2000.0 | 93.0 | 5.1 | 2.0 | 1.3 | 8.4 | 64.6 |
### dev set gigaspeech scoring
| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err |
|-----------|-------|--------|------|-----|-----|-----|------|-------|
| Sum/Avg | 5715 | 127790 | 92.1 | 5.8 | 2.1 | 2.8 | 10.7 | 69.9 |
| Mean | 204.1 | 4563.9 | 92.9 | 5.2 | 1.9 | 2.0 | 9.1 | 69.4 |
| S.D. | 269.7 | 4551.6 | 3.4 | 2.7 | 0.9 | 1.7 | 4.6 | 15.9 |
| Median | 151.5 | 3314.0 | 93.8 | 4.4 | 1.6 | 1.7 | 7.9 | 71.6 |
## Conformer U2++ Result
* Feature info: using fbank feature, dither 1.0, cmvn, 16k
* Training info: conf/train_u2++_conformer.yaml, subsample 6, kernel size 31, lr 0.001, batch size 28, 8 gpu, acc_grad 1, 50 epochs
* Decoding info: ctc_weight 0.3, reverse_weight 0.5, average_num 10
* Git hash: 9a0c270f9f976d7e887f777690e6c358a45a1c27
### test set gigaspeech scoring, full chunk (non-streaming)
| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err |
|-----------|-------|--------|------|-----|-----|-----|------|-------|
| Sum/Avg | 19928 | 390656 | 90.7 | 6.8 | 2.6 | 2.0 | 11.3 | 66.9 |
| Mean | 152.1 | 2982.1 | 90.6 | 6.8 | 2.7 | 1.6 | 11.1 | 67.1 |
| S.D. | 142.2 | 2838.1 | 5.8 | 4.3 | 1.9 | 1.2 | 6.7 | 16.5 |
| Median | 108.0 | 2000.0 | 92.1 | 5.7 | 2.2 | 1.3 | 9.0 | 68.9 |
### test set gigaspeech scoring, chunk 8 (latency range from 0 to 480ms)
| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err |
|-----------|-------|--------|------|-----|-----|-----|------|-------|
| Sum/Avg | 19928 | 390656 | 89.6 | 7.5 | 2.9 | 2.0 | 12.5 | 70.1 |
| Mean | 152.1 | 2982.1 | 89.3 | 7.6 | 3.1 | 1.7 | 12.4 | 70.6 |
| S.D. | 142.2 | 2838.1 | 6.5 | 4.9 | 2.1 | 1.2 | 7.3 | 15.8 |
| Median | 108.0 | 2000.0 | 91.1 | 6.3 | 2.5 | 1.4 | 10.2 | 72.2 |
## Conformer Result
* Feature info: using fbank feature, dither 1.0, no cmvn, 48k
* Training info: conf/train_conformer.yaml, kernel size 31, lr 0.001, batch size 24, 8 gpu, acc_grad 4, 30 epochs
* Decoding info: ctc_weight 0.5, average_num 5
* Git hash: 9a0c270f9f976d7e887f777690e6c358a45a1c27
### test set gigaspeech scoring
| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err |
|---------------|-------|--------|------|-----|-----|-----|------|-------|
| Sum/Avg | 19930 | 390744 | 90.8 | 6.9 | 2.3 | 2.0 | 11.2 | 65.1 |
| Mean | 152.1 | 2982.8 | 90.6 | 6.9 | 2.5 | 1.7 | 11.1 | 65.7 |
| S.D. | 142.3 | 2839.0 | 5.8 | 4.3 | 1.7 | 1.2 | 6.7 | 16.6 |
| Median | 108.0 | 2000.0 | 92.5 | 5.6 | 2.1 | 1.3 | 9.1 | 65.9 |