Commit 764b3a75 authored by Sugon_ldc

add new model
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d6 # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 31
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

dataset_conf:
    filter_conf:
        max_length: 2000
        min_length: 100
        token_max_length: 160
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: false
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 1.0
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 3
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 28

grad_clip: 5
accum_grad: 4
max_epoch: 30
log_interval: 100

optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 100000
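
# Note: with accum_grad 4, gradients are accumulated over 4 mini-batches before
# each optimizer step, so the effective batch size is roughly
# batch_size * accum_grad per GPU (28 * 4 here), and larger again when several
# GPUs are used with DDP.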
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 31
    use_cnn_module: True
    cnn_module_norm: 'layer_norm'
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: bitransformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 3
    r_num_blocks: 3
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    reverse_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

dataset_conf:
    filter_conf:
        max_length: 2000
        min_length: 100
        token_max_length: 160
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: false
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 1.0
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 3
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 20

grad_clip: 5
accum_grad: 4
max_epoch: 50
log_interval: 100

optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 100000
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d6 # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 8
    use_cnn_module: True
    causal: true
    use_dynamic_chunk: true
    cnn_module_norm: 'layer_norm'
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: bitransformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 3
    r_num_blocks: 3
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    reverse_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

dataset_conf:
    filter_conf:
        max_length: 2000
        min_length: 100
        token_max_length: 160
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: false
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 1.0
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 3
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 28

grad_clip: 5
accum_grad: 1
max_epoch: 50
log_interval: 100

optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 80000
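
# Note: causal: true together with use_dynamic_chunk: true trains a unified
# streaming/non-streaming model; the chunk size is then chosen at decode time
# via --decoding_chunk_size in wenet/bin/recognize.py (leave it empty or use -1
# for full-context, non-streaming decoding), as done in run.sh below.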
#!/usr/bin/env python
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
#                Mobvoi Corporation (Author: Di Wu)

import sys
import os
import argparse
import json


def get_args():
    parser = argparse.ArgumentParser(description="""
        This script is used to process the raw json dataset of GigaSpeech,
        where the long wav is split into segments and
        data of wenet format is generated.
        """)
    parser.add_argument('input_json', help="""Input json file of GigaSpeech""")
    parser.add_argument('output_dir', help="""Output dir for prepared data""")
    args = parser.parse_args()
    return args


def meta_analysis(input_json, output_dir):
    input_dir = os.path.dirname(input_json)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    try:
        with open(input_json, 'r') as injson:
            json_data = json.load(injson)
    except Exception:
        sys.exit(f'Failed to load input json file: {input_json}')
    else:
        if json_data['audios'] is not None:
            with open(f'{output_dir}/text', 'w') as utt2text, \
                    open(f'{output_dir}/segments', 'w') as segments, \
                    open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
                    open(f'{output_dir}/wav.scp', 'w') as wavscp, \
                    open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
                    open(f'{output_dir}/reco2dur', 'w') as reco2dur:
                for long_audio in json_data['audios']:
                    try:
                        long_audio_path = os.path.realpath(
                            os.path.join(input_dir, long_audio['path']))
                        aid = long_audio['aid']
                        segments_lists = long_audio['segments']
                        duration = long_audio['duration']
                        assert (os.path.exists(long_audio_path))
                        assert ('opus' == long_audio['format'])
                        assert (16000 == long_audio['sample_rate'])
                    except AssertionError:
                        print(f'Warning: {aid} something is wrong, maybe '
                              'AssertionError, skipped')
                        continue
                    except Warning:
                        print(f'Warning: {aid} something is wrong, maybe the '
                              f'error path: {long_audio_path}, skipped')
                        continue
                    else:
                        wavscp.write(f'{aid}\t{long_audio_path}\n')
                        reco2dur.write(f'{aid}\t{duration}\n')
                        for segment_file in segments_lists:
                            try:
                                sid = segment_file['sid']
                                start_time = segment_file['begin_time']
                                end_time = segment_file['end_time']
                                dur = end_time - start_time
                                text = segment_file['text_tn']
                                segment_subsets = segment_file["subsets"]
                            except Warning:
                                print(f'Warning: {segment_file} something is '
                                      'wrong, skipped')
                                continue
                            else:
                                utt2text.write(f'{sid}\t{text}\n')
                                segments.write(
                                    f'{sid}\t{aid}\t{start_time}\t{end_time}\n')
                                utt2dur.write(f'{sid}\t{dur}\n')
                                segment_sub_names = " ".join(segment_subsets)
                                utt2subsets.write(
                                    f'{sid}\t{segment_sub_names}\n')


def main():
    args = get_args()
    meta_analysis(args.input_json, args.output_dir)


if __name__ == '__main__':
    main()
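
# Example invocation (paths are illustrative; the data prep script below calls
# this as):
#   python3 local/extract_meta.py $gigaspeech_dir/GigaSpeech.json $corpus_dir
# which writes wav.scp, text, segments, utt2dur, utt2subsets and reco2dur
# into the corpus directory.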
#!/usr/bin/env bash
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
# Seasalt AI, Inc (Author: Guoguo Chen)
# Mobvoi Corporation (Author: Di Wu)
set -e
set -o pipefail
stage=1
prefix=
garbage_utterance_tags="<SIL> <MUSIC> <NOISE> <OTHER>"
punctuation_tags="<COMMA> <EXCLAMATIONPOINT> <PERIOD> <QUESTIONMARK>"
train_subset=XL
. ./tools/parse_options.sh || exit 1;
filter_by_id () {
  idlist=$1
  input=$2
  output=$3
  field=1
  if [ $# -eq 4 ]; then
    field=$4
  fi
  cat $input | perl -se '
    open(F, "<$idlist") || die "Could not open id-list file $idlist";
    while(<F>) {
      @A = split;
      @A>=1 || die "Invalid id-list file line $_";
      $seen{$A[0]} = 1;
    }
    while(<>) {
      @A = split;
      @A > 0 || die "Invalid file line $_";
      @A >= $field || die "Invalid file line $_";
      if ($seen{$A[$field-1]}) {
        print $_;
      }
    }' -- -idlist="$idlist" -field="$field" > $output ||\
  (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
}
subset_data_dir () {
  utt_list=$1
  src_dir=$2
  dest_dir=$3
  mkdir -p $dest_dir || exit 1;
  # wav.scp text segments utt2dur
  filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\
    (echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1;
  filter_by_id $utt_list $src_dir/text $dest_dir/text ||\
    (echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1;
  filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\
    (echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1;
  awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco
  filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\
    (echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1;
  rm -f $dest_dir/reco
}
if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <gigaspeech-dataset-dir> <data-dir>"
  echo " e.g.: $0 --train-subset XL /disk1/audio_data/gigaspeech/ data/"
  echo ""
  echo "This script takes the GigaSpeech source directory, and prepares the"
  echo "WeNet format data directory."
  echo "  --garbage-utterance-tags <tags>   # Tags for non-speech."
  echo "  --prefix <prefix>                 # Prefix for output data directory."
  echo "  --punctuation-tags <tags>         # Tags for punctuations."
  echo "  --stage <stage>                   # Processing stage."
  echo "  --train-subset <XL|L|M|S|XS>      # Train subset to be created."
  exit 1
fi
gigaspeech_dir=$1
data_dir=$2
declare -A subsets
subsets=(
  [XL]="train_xl"
  [L]="train_l"
  [M]="train_m"
  [S]="train_s"
  [XS]="train_xs"
  [DEV]="dev"
  [TEST]="test")
corpus_dir=$data_dir/${prefix}corpus/
if [ $stage -le 1 ]; then
  echo "$0: Extract meta into $corpus_dir"
  # Sanity check.
  [ ! -f $gigaspeech_dir/GigaSpeech.json ] &&\
    echo "$0: Please download $gigaspeech_dir/GigaSpeech.json!" && exit 1;
  [ ! -d $gigaspeech_dir/audio ] &&\
    echo "$0: Please download $gigaspeech_dir/audio!" && exit 1;
  [ ! -d $corpus_dir ] && mkdir -p $corpus_dir
  # Files to be created:
  # wav.scp text segments utt2dur
  python3 local/extract_meta.py \
    $gigaspeech_dir/GigaSpeech.json $corpus_dir || exit 1;
fi
if [ $stage -le 2 ]; then
  echo "$0: Filter $corpus_dir/text"
  # Delete utterances with garbage meta tags
  for tag in $garbage_utterance_tags; do
    sed -i "/${tag}/d" $corpus_dir/text
  done
  # Delete punctuation tags in utterances
  for tag in $punctuation_tags; do
    sed -i "s/${tag}//g" $corpus_dir/text
  done
  # Ensure spaces only appear once and the utt id is separated from the
  # transcript by '\t'
  sed -i 's/\t/ /g' $corpus_dir/text
  sed -i 's/[ ][ ]*/ /g' $corpus_dir/text
  sed -i 's/ /\t/' $corpus_dir/text
fi
if [ $stage -le 3 ]; then
  echo "$0: Split data to train, dev and test"
  # Split data to train, dev and test.
  [ ! -f $corpus_dir/utt2subsets ] &&\
    echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;
  for label in $train_subset DEV TEST; do
    if [ ! ${subsets[$label]+set} ]; then
      echo "$0: Subset $label is not defined in GigaSpeech.json." && exit 1;
    fi
    subset=${subsets[$label]}
    [ ! -d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset
    grep "{$label}" $corpus_dir/utt2subsets \
      > $corpus_dir/${prefix}${subset}_utt_list || exit 1;
    subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \
      $corpus_dir $data_dir/${prefix}$subset || exit 1;
  done
fi
echo "$0: Done"
#!/usr/bin/env python3
import os
import argparse

conversational_filler = [
    'UH', 'UHH', 'UM', 'EH', 'MM', 'HM', 'AH', 'HUH', 'HA', 'ER', 'OOF', 'HEE',
    'ACH', 'EEE', 'EW'
]
unk_tags = ['<UNK>', '<unk>']
gigaspeech_punctuations = [
    '<COMMA>', '<PERIOD>', '<QUESTIONMARK>', '<EXCLAMATIONPOINT>'
]
gigaspeech_garbage_utterance_tags = ['<SIL>', '<NOISE>', '<MUSIC>', '<OTHER>']
non_scoring_words = conversational_filler + unk_tags + \
    gigaspeech_punctuations + gigaspeech_garbage_utterance_tags


def asr_text_post_processing(text):
    # 1. convert to uppercase
    text = text.upper()

    # 2. remove hyphen
    #    "E-COMMERCE" -> "E COMMERCE", "STATE-OF-THE-ART" -> "STATE OF THE ART"
    text = text.replace('-', ' ')

    # 3. remove non-scoring words from evaluation
    remaining_words = []
    for word in text.split():
        if word in non_scoring_words:
            continue
        remaining_words.append(word)

    return ' '.join(remaining_words)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='''This script evaluates GigaSpeech ASR
        result via SCTK's tool sclite''')
    parser.add_argument(
        'ref',
        type=str,
        help="sclite's standard transcription(trn) reference file")
    parser.add_argument(
        'hyp',
        type=str,
        help="sclite's standard transcription(trn) hypothesis file")
    parser.add_argument('work_dir', type=str, help='working dir')
    args = parser.parse_args()

    if not os.path.isdir(args.work_dir):
        os.mkdir(args.work_dir)

    REF = os.path.join(args.work_dir, 'REF')
    HYP = os.path.join(args.work_dir, 'HYP')
    RESULT = os.path.join(args.work_dir, 'RESULT')

    for io in [(args.ref, REF), (args.hyp, HYP)]:
        with open(io[0], 'r', encoding='utf8') as fi, \
                open(io[1], 'w+', encoding='utf8') as fo:
            for line in fi:
                line = line.strip()
                if line:
                    cols = line.split()
                    text = asr_text_post_processing(' '.join(cols[0:-1]))
                    uttid_field = cols[-1]
                    print(f'{text} {uttid_field}', file=fo)

    # GigaSpeech's uttid conforms to swb
    os.system(f'sclite -r {REF} trn -h {HYP} trn -i swb | tee {RESULT}')
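
# A quick illustration of the post-processing above (values are examples):
#   asr_text_post_processing("state-of-the-art <COMMA> um yes")
#   -> "STATE OF THE ART YES"
# run.sh invokes this file as:
#   local/gigaspeech_scoring.py $data/$test/ref $test_dir/hyp $test_dir
# and the sclite alignment report ends up in the work dir's RESULT file.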
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2021 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU; otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
# The num of nodes or machines used for multi-machine training
# Default 1 for single machine/node
# NFS will be needed if you want to run multi-machine training
num_nodes=1
# The rank of each node or machine, ranging from 0 to num_nodes - 1.
# The first node/machine sets node_rank 0, the second one sets node_rank 1,
# the third one sets node_rank 2, and so on. Default 0
node_rank=0
# data
# Use your own data path. You can contact gigaspeech@speechcolab.org for
# information about getting the GigaSpeech data.
# The preparation of the GigaSpeech dataset for WeNet can be found at
# https://github.com/SpeechColab/GigaSpeech
giga_data_dir=/export/expts6/corpus/data/en-asr-data/16k/GigaSpeech
shards_dir=/ssd/nfs06/unified_data/giga_shards
# gigaspeech training set
set=XL
train_set=train_`echo $set |tr 'A-Z' 'a-z'`
train_dev=dev
recog_set=test
# wav data dir
data=data
nj=16
# Optional train_config
# 1. conf/train_conformer.yaml: Standard Conformer
# 2. conf/train_conformer_bidecoder.yaml: Bidecoder Conformer
train_config=conf/train_conformer_bidecoder.yaml
checkpoint=
cmvn=false
do_delta=false
dir=exp/sp_spec_aug
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
# You may need to adjust it if you cannot get results close to those in README.md
average_num=3
decode_modes="attention_rescoring ctc_greedy_search"
. tools/parse_options.sh || exit 1;
# bpemode (unigram or bpe)
nbpe=5000
bpemode=unigram
set -e
set -u
set -o pipefail
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to do the following data preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: Data preparation"
local/gigaspeech_data_prep.sh --train-subset $set --stage 1 $giga_data_dir $data
sed -i "s/\t/ /g" $data/${train_set}/text
sed -i "s/\t/ /g" $data/${train_dev}/text
sed -i "s/\t/ /g" $data/${recog_set}/text
for x in $train_dev $train_set $recog_set; do
paste -d " " <(cut -f1 -d " " $data/$x/text) <(cut -f1 -d " " $data/$x/text) > $data/$x/spk2utt
cp $data/$x/spk2utt $data/$x/utt2spk
tools/fix_data_dir.sh $data/$x
done
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
### Task dependent. You have to design training and dev sets by yourself.
echo "stage 1: generate segmented wav.scp and compute cmvn"
# the format of wav.segment.scp is:
# POD1000000004_S0000000 /GigaSpeech/audio/podcast/P0000/POD1000000004.opus,0.0,10.197
# 0.0 is start time, 10.197 is end time (second)
for x in $train_dev $train_set $recog_set; do
python tools/segment.py --segments $data/$x/segments \
--input $data/$x/wav.scp \
--output $data/$x/wav.segment.scp
done
# optional
# compute cmvn, perhaps you can sample some segmented examples from wav.scp for cmvn computation
python tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp $data/$train_set/wav.segment.scp \
--out_cmvn $data/$train_set/global_cmvn
fi
dict=$data/lang_char_$set/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=$data/lang_char_$set/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p $data/lang_char_$set/
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
# We borrowed the BPE-related code and scripts from ESPnet.
cut -f 2- -d" " $data/${train_set}/text > $data/lang_char_$set/input.txt
tools/spm_train --input=$data/lang_char_$set/input.txt --vocab_size=${nbpe} \
--model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
tools/spm_encode --model=${bpemodel}.model --output_format=piece \
< $data/lang_char_$set/input.txt | \
tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
wc -l ${dict}
fi
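
# The resulting dictionary is a plain "token id" table; for illustration
# (the actual BPE pieces depend on the training text), it looks roughly like:
#   <blank> 0
#   <unk> 1
#   ▁THE 2
#   ...
#   <sos/eos> <num_token>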
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Making shards, please wait..."
RED='\033[0;31m'
NOCOLOR='\033[0m'
echo -e "It requires ${RED}1.2T ${NOCOLOR}space for $shards_dir, please make sure you have enough space"
echo -e "It takes about ${RED}12 ${NOCOLOR}hours with 32 threads"
for x in $train_dev $train_set $recog_set; do
dst=$shards_dir/$x
mkdir -p $dst
tools/make_shard_list.py --resample 16000 --num_utts_per_shard 1000 \
--num_threads 32 --segments data/$x/segments \
data/$x/wav.scp data/$x/text \
$(realpath $dst) data/$x/data.list
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="nccl"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp ${feat_dir}/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process used for knowing whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type "shard" \
--symbol_table $dict \
--bpe_model $bpemodel.model \
--train_data $data/$train_set/data.list \
--cv_data $data/$train_dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 16 \
$cmvn_opts
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test model, please specify the model you want to test by --checkpoint
cmvn_opts=
$cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
# TODO, Add model average here
mkdir -p $dir/test
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
ctc_weight=0.5
# Polling GPU id begin with index 0
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
idx=0
for test in $recog_set; do
for mode in ${decode_modes}; do
{
{
test_dir=$dir/${test}_${mode}
mkdir -p $test_dir
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type "shard" \
--symbol_table $dict \
--bpe_model $bpemodel.model \
--test_data $data/$test/format.data \
--checkpoint $decode_checkpoint \
--beam_size 20 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--result_file $test_dir/text_bpe \
--ctc_weight $ctc_weight \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp
cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp
tools/spm_decode --model=${bpemodel}.model --input_format=piece \
< $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value
paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value > $test_dir/text
# a raw version of WER without the refining process
python tools/compute-wer.py --char=1 --v=1 \
$data/$test/text $test_dir/text > $test_dir/wer
# for gigaspeech scoring
cat $test_dir/text_bpe_key_tmp | sed -e "s/^/(/g" | sed -e "s/$/)/g" > $test_dir/hyp_key
paste -d " " $test_dir/text_value $test_dir/hyp_key > $test_dir/hyp
paste -d " " <(cut -f2- -d " " $data/$test/text) \
<(cut -f1 -d " " $data/$test/text | \
sed -e "s/^/(/g" | sed -e "s/$/)/g") > $data/$test/ref
local/gigaspeech_scoring.py $data/$test/ref $test_dir/hyp $test_dir
} &
((idx+=1))
if [ $idx -eq $num_gpus ]; then
idx=0
fi
}
done
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip
fi
../../../tools
../../../wenet
# Performance Record
## Conformer Result (Old IO)
* Feature info: using fbank feature, with cmvn, with speed perturb.
* Training info: lr 0.002, batch size 16, 1 machine, 1 * 4 = 4 GPUs, acc_grad 4, 240 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 30
| decoding mode            | CER   |
|--------------------------|-------|
| attention decoder | 21.9 |
| ctc greedy search | 21.15 |
| ctc prefix beam search | 21.13 |
| attention rescoring | 20.47 |
## Conformer Result (New IO)
* Feature info: using fbank feature, with cmvn, with speed perturb.
* Training info: lr 0.002, batch size 16, 1 machine, 1 * 4 = 4 GPUs, acc_grad 4, 133 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 30
| decoding mode            | CER   |
|--------------------------|-------|
| attention decoder | 21.42 |
| ctc greedy search | 21.16 |
| ctc prefix beam search | 21.18 |
| attention rescoring | 20.42 |
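
The averaged checkpoint referenced above (average_num 30) is produced by `wenet/bin/average_model.py` inside `run.sh`; a sketch of the equivalent standalone command, assuming the recipe's default experiment directory `exp/conformer`, is:

```bash
# Average the 30 best checkpoints (by validation loss) into a single model.
python wenet/bin/average_model.py \
  --dst_model exp/conformer/avg_30.pt \
  --src_path exp/conformer \
  --num 30 \
  --val_best
```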
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# feature extraction
dataset_conf:
    filter_conf:
        max_length: 2000
        min_length: 50
        token_max_length: 400
        token_min_length: 1
        max_output_input_ratio: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 16

grad_clip: 5
accum_grad: 4
max_epoch: 240
log_interval: 100

optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
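
# Note on the schedule (a rough description of the warmuplr scheduler as
# implemented in WeNet/ESPnet): the learning rate ramps up roughly linearly for
# the first warmup_steps optimizer steps and then decays approximately with the
# inverse square root of the step count, so the peak lr of about 0.002 is
# reached around step 25000 here.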
#!/usr/bin/env bash
. ./path.sh || exit 1;
if [ $# != 2 ]; then
echo "Usage: $0 <audio-path> <text-path>"
echo " $0 /export/corpora/LDC03S04 /export/corpora/LDC03T19"
exit 1;
fi
hkust_audio_dir=$1
hkust_text_dir=$2
train_dir=data/local/train
dev_dir=data/local/dev
train_dev=train_dev
train_nodev=train_nodev
nj=16
mkdir -p $train_dir
mkdir -p $dev_dir
#data directory check
if [ ! -d $hkust_audio_dir ] || [ ! -d $hkust_text_dir ]; then
echo "Error: $0 requires two directory arguments"
exit 1;
fi
#find sph audio file for train dev resp.
find $hkust_audio_dir -iname "*.sph" | grep -i "audio/train" > $train_dir/sph.flist || exit 1;
find $hkust_audio_dir -iname "*.sph" | grep -i "audio/dev" > $dev_dir/sph.flist || exit 1;
n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l`
[ $n -ne 897 ] && \
echo Warning: expected 897 data files, found $n
#Transcriptions preparation
#collect all trans, convert encodings to utf-8,
find $hkust_text_dir -iname "*.txt" | grep -i "trans/train" | xargs cat |\
iconv -f GBK -t UTF-8 | perl -e '
while (<STDIN>) {
@A = split(" ", $_);
if (@A <= 1) { next; }
if ($A[0] eq "#") { $utt_id = $A[1]; }
if (@A >= 3) {
$A[2] =~ s:^([AB])\:$:$1:;
printf "%s-%s-%06.0f-%06.0f", $utt_id, $A[2], 100*$A[0] + 0.5, 100*$A[1] + 0.5;
for($n = 3; $n < @A; $n++) { print " $A[$n]" };
print "\n";
}
}
' | sort -k1 > $train_dir/transcripts.txt || exit 1;
find $hkust_text_dir -iname "*.txt" | grep -i "trans/dev" | xargs cat |\
iconv -f GBK -t UTF-8 | perl -e '
while (<STDIN>) {
@A = split(" ", $_);
if (@A <= 1) { next; }
if ($A[0] eq "#") { $utt_id = $A[1]; }
if (@A >= 3) {
$A[2] =~ s:^([AB])\:$:$1:;
printf "%s-%s-%06.0f-%06.0f", $utt_id, $A[2], 100*$A[0] + 0.5, 100*$A[1] + 0.5;
for($n = 3; $n < @A; $n++) { print " $A[$n]" };
print "\n";
}
}
' | sort -k1 > $dev_dir/transcripts.txt || exit 1;
#transcripts normalization and segmentation
cat $train_dir/transcripts.txt |\
sed -e 's/<foreign language=\"[a-zA-Z]\+\">/ /g' |\
sed -e 's/<\/foreign>/ /g' |\
sed -e 's/<noise>\(.\+\)<\/noise>/\1/g' |\
sed -e 's/<\/noise>//g' |\
sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
awk '{if (NF > 1) print $0;}' |\
local/hkust_normalize.pl |\
awk '{if (NF > 0) print $0;}' > $train_dir/text || exit 1;
cat $dev_dir/transcripts.txt |\
sed -e 's/<foreign language=\"[a-zA-Z]\+\">/ /g' |\
sed -e 's/<\/foreign>/ /g' |\
sed -e 's/<noise>\(.\+\)<\/noise>/\1/g' |\
sed -e 's/<\/noise>//g' |\
sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
awk '{if (NF > 1) print $0;}' |\
local/hkust_normalize.pl |\
awk '{if (NF > 0) print $0;}' > $dev_dir/text || exit 1;
# some data is corrupted. Delete them
cat $train_dir/text | grep -v 20040527_210939_A901153_B901154-A-035691-035691 | egrep -v "A:|B:" > tmp
mv tmp $train_dir/text || exit 1;
#Make segment files from transcript
#segments file format is: utt-id side-id start-time end-time, e.g.:
#sw02001-A_000098-001156 sw02001-A 0.98 11.56
awk '{ segment=$1; split(segment,S,"-"); side=S[2]; audioname=S[1];startf=S[3];endf=S[4];
print segment " " audioname "-" side " " startf/100 " " endf/100}' <$train_dir/text > $train_dir/segments
awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' $train_dir/sph.flist > $train_dir/sph.scp
awk '{ segment=$1; split(segment,S,"-"); side=S[2]; audioname=S[1];startf=S[3];endf=S[4];
print segment " " audioname "-" side " " startf/100 " " endf/100}' <$dev_dir/text > $dev_dir/segments
awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' $dev_dir/sph.flist > $dev_dir/sph.scp
bash tools/sph2wav.sh --nj ${nj} $train_dir/sph.scp $train_dir/segments $train_dir/wav.scp
bash tools/sph2wav.sh --nj ${nj} $dev_dir/sph.scp $dev_dir/segments $dev_dir/wav.scp
#side A - channel 1, side B - channel 2
# this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
# to the file name sw02001 and the A, e.g.
# sw02001-A sw02001 A
# In this case it's trivial, but in other corpora the information might
# be less obvious. Later it will be needed for ctm scoring.
cat $train_dir/wav_ori.scp | awk '{print $1}' | \
perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; print "$1-$2 $1 $2\n"; ' \
> $train_dir/reco2file_and_channel || exit 1;
cat $dev_dir/wav_ori.scp | awk '{print $1}' | \
perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; print "$1-$2 $1 $2\n"; ' \
> $dev_dir/reco2file_and_channel || exit 1;
cat $train_dir/segments | awk '{spk=substr($1,1,33); print $1 " " spk}' > $train_dir/utt2spk || exit 1;
cat $train_dir/utt2spk | sort -k 2 | tools/utt2spk_to_spk2utt.pl > $train_dir/spk2utt || exit 1;
cat $dev_dir/segments | awk '{spk=substr($1,1,33); print $1 " " spk}' > $dev_dir/utt2spk || exit 1;
cat $dev_dir/utt2spk | sort -k 2 | tools/utt2spk_to_spk2utt.pl > $dev_dir/spk2utt || exit 1;
mkdir -p data/train data/dev
for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
cp data/local/train/$f data/train/$f || exit 1;
done
for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
cp data/local/dev/$f data/dev/$f || exit 1;
done
tools/subset_data_dir.sh --first data/train 4001 data/${train_dev}
n=$(($(wc -l < data/train/segments) - 4001))
tools/subset_data_dir.sh --last data/train ${n} data/${train_nodev}
echo "$0: HKUST data preparation succeeded"
exit 0
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright Chao Weng
# normalizations for hkust transcript
# see the docs/trans-guidelines.pdf for details
while (<STDIN>) {
  @A = split(" ", $_);
  print "$A[0] ";
  for ($n = 1; $n < @A; $n++) {
    $a = $A[$n];
    if (($a eq "{breath}")||($a eq "{cough}")||($a eq "{sneeze}")
        || ($a eq "{lipsmack}")) {next;}
    if (($a eq "{laugh}")) {next;}
    if (($a eq "<noise>")) {next;}
    $tmp = $a;
    if ($tmp =~ /[^.,?+-]{0,}[.,?+-]+/) { $tmp =~ s:([^.,?+-]{0,})[.,?+-]+:$1:g; }
    if ($tmp =~ /\~[A-Z]/) { $tmp =~ s:\~([A-Z]):$1:; }
    if ($tmp =~ /%\S/) { $tmp =~ s:%(\S):$1:; }
    if ($tmp =~ /[a-zA-Z]/) {$tmp=uc($tmp);}
    print "$tmp ";
  }
  print "\n";
}
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU; otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3"
stage=4 # start from 0 if you need to start from data preparation
stop_stage=4
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
# The num of nodes or machines used for multi-machine training
# Default 1 for single machine/node
# NFS will be needed if you want to run multi-machine training
num_nodes=1
# The rank of each node or machine, ranging from 0 to num_nodes - 1.
# The first node/machine sets node_rank 0, the second one sets node_rank 1,
# the third one sets node_rank 2, and so on. Default 0
node_rank=0
nj=16
feat_dir=raw_wav
data_type=raw
num_utts_per_shard=1000
prefetch=100
train_set=train_nodev
dev_set=train_dev
# Optional train_config
# 1. conf/train_transformer.yaml: Standard transformer
# 2. conf/train_conformer.yaml: Standard conformer
# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer
# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer
train_config=conf/train_conformer.yaml
# English modeling unit
# Optional 1. bpe 2. char
en_modeling_unit=bpe
dict=data/dict_$en_modeling_unit/lang_char.txt
cmvn=true
debug=false
num_workers=2
dir=exp/conformer
checkpoint=
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=30
decode_modes="ctc_greedy_search ctc_prefix_beam_search
attention attention_rescoring"
. tools/parse_options.sh || exit 1;
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Data preparation
local/hkust_data_prep.sh /mnt/cfs/database/hkust/LDC2005S15/ \
/mnt/cfs/database/hkust/LDC2005T32/ || exit 1;
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# For wav feature, just copy the data. Fbank extraction is done in training
mkdir -p ${feat_dir}_${en_modeling_unit}
for x in ${train_set} ${dev_set}; do
cp -r data/$x ${feat_dir}_${en_modeling_unit}
done
cp -r data/dev ${feat_dir}_${en_modeling_unit}/test
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp data/${train_set}/wav.scp \
--out_cmvn ${feat_dir}_${en_modeling_unit}/$train_set/global_cmvn
fi
# This BPE model is trained on the LibriSpeech training set.
bpecode=conf/train_960_unigram5000.model
trans_type_ops=
bpe_ops=
if [ $en_modeling_unit = "bpe" ]; then
trans_type_ops="--trans_type cn_char_en_bpe"
bpe_ops="--bpecode ${bpecode}"
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Make train dict
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
paste -d " " \
<(cut -f 1 -d" " ${feat_dir}_${en_modeling_unit}/${train_set}/text) \
<(cut -f 2- -d" " ${feat_dir}_${en_modeling_unit}/${train_set}/text \
| tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' \
| sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " " ) \
> ${feat_dir}_${en_modeling_unit}/${train_set}/text4dict
sed -i 's/\xEF\xBB\xBF//' \
${feat_dir}_${en_modeling_unit}/${train_set}/text4dict
tools/text2token.py -s 1 -n 1 -m ${bpecode} \
${feat_dir}_${en_modeling_unit}/${train_set}/text4dict ${trans_type_ops} \
| cut -f 2- -d" " | tr " " "\n" \
| sort | uniq | grep -a -v -e '^\s*$' \
| grep -v '·' | grep -v '“' | grep -v "”" | grep -v "\[" | grep -v "\]" \
| grep -v "…" \
| awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# Prepare wenet required data
echo "Prepare data, prepare required format"
for x in ${dev_set} ${train_set} test; do
if [ $data_type == "shard" ]; then
tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
--num_threads 16 ${feat_dir}_${en_modeling_unit}/$x/wav.scp \
${feat_dir}_${en_modeling_unit}/$x/text \
$(realpath ${feat_dir}_${en_modeling_unit}/$x/shards) \
${feat_dir}_${en_modeling_unit}/$x/data.list
else
tools/make_raw_list.py ${feat_dir}_${en_modeling_unit}/$x/wav.scp \
${feat_dir}_${en_modeling_unit}/$x/text \
${feat_dir}_${en_modeling_unit}/$x/data.list
fi
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
# You had better rm it manually before you start run.sh on the first node.
# rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp ${feat_dir}_${en_modeling_unit}/$train_set/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process used for knowing whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
--prefetch $prefetch \
--train_data ${feat_dir}_${en_modeling_unit}/$train_set/data.list \
--cv_data ${feat_dir}_${en_modeling_unit}/$dev_set/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 1 \
$cmvn_opts \
--pin_memory \
--bpe_model ${bpecode}
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test model, please specify the model you want to test by --checkpoint
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=-1
ctc_weight=0.5
idx=0
for mode in ${decode_modes}; do
{
test_dir="$dir/"`
`"test_${mode}${decoding_chunk_size:+_chunk$decoding_chunk_size}/test"
mkdir -p $test_dir
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data ${feat_dir}_${en_modeling_unit}/test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--result_file $test_dir/text_${en_modeling_unit} \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
if [ $en_modeling_unit == "bpe" ]; then
tools/spm_decode --model=${bpecode} --input_format=piece \
< $test_dir/text_${en_modeling_unit} | sed -e "s/▁/ /g" > $test_dir/text
else
cat $test_dir/text_${en_modeling_unit} \
| sed -e "s/▁/ /g" > $test_dir/text
fi
# CER is used to be consistent with kaldi & espnet
python tools/compute-cer.py --char=1 --v=1 \
${feat_dir}_${en_modeling_unit}/test/text $test_dir/text > $test_dir/wer
} &
((idx+=1))
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip \
--output_quant_file $dir/final_quant.zip
fi
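
# Note: export_jit.py saves the averaged checkpoint as a TorchScript model
# (final.zip, plus a quantized final_quant.zip here), which is what the
# libtorch runtime referenced in path.sh consumes.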
../../../tools
../../../wenet
# Performance Record
## Conformer Bidecoder Transducer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.001, dynamic batch with max_frames_in_batch 4000, 8 gpu, acc_grad 1, 60 epochs
* Training weight info: transducer_weight 0.75, ctc_weight 0.1, reverse_weight 0.30, average_num 10
* Predictor type: lstm
| decoding mode | dev_clean | dev_other | test_clean | test_other |
|-----------------------|------------|-----------|------------|------------|
| rnnt_greedy_search | 3.42% | 8.99% | 3.56% | 9.15% |
| rnnt_beam_search | 3.35% | 8.77% | 3.45% | 8.78% |
| rnnt_beam_att_rescore | 3.25% | 8.66% | 3.41% | 8.68% |
Pretrained model: https://huggingface.co/yuekai/wenet-asr-librispeech-conformer-transducer-mtl/blob/main/exp/conformer_transducer/avg_10.pt