Commit 764b3a75 authored by Sugon_ldc

add new model

# Conformer-based end-to-end model for the VKW challenge
## Standard E2E Results
Conformer without speed perturb and LM
* config: conf/train_train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char.yaml
* beam: 10
* num of gpu: 8
* num of averaged model: 5
* ctc weight (used for attention rescoring): 0.5
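For reference, below is a minimal sketch of how these decoding settings map onto the WeNet tools invoked in `run.sh` (checkpoint averaging followed by attention-rescoring decoding). `$dir` is the experiment directory from `run.sh`; the flags follow the `average_model.py`/`recognize.py` invocations used elsewhere in this recipe and may differ slightly across WeNet versions:

```bash
# average the 5 best checkpoints (by validation loss) into one model
python3 wenet/bin/average_model.py \
  --dst_model $dir/avg_5.pt \
  --src_path $dir \
  --num 5 \
  --val_best

# decode the dev set with attention rescoring, beam 10, ctc weight 0.5
python3 wenet/bin/recognize.py --gpu 0 \
  --mode attention_rescoring \
  --config $dir/train.yaml \
  --data_type raw \
  --test_data data/combine_dev/data.list \
  --checkpoint $dir/avg_5.pt \
  --beam_size 10 \
  --ctc_weight 0.5 \
  --dict data/dict/lang_char.txt \
  --result_file $dir/text
```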
Dev set results, trained only on the training set (785 keywords, 1505-hour train set):
| scenario | Precision | Recall | F1 | ATWV |
|----------|-----------|----------|--------|--------|
| lgv | 0.9281 | 0.6420 | 0.7590 | 0.5183 |
| liv | 0.8886 | 0.6515 | 0.7518 | 0.6050 |
| stv | 0.9120 | 0.7471 | 0.8213 | 0.6256 |
Dev set results, trained on the training set plus the finetune set (785 keywords, 1505-hour train set + 15-hour finetune set):
| scenario | Precision | Recall | F1 | ATWV |
|----------|-----------|----------|--------|--------|
| lgv | 0.9478 | 0.7311 | 0.8255 | 0.6352 |
| liv | 0.9177 | 0.8398 | 0.8770 | 0.7412 |
| stv | 0.9320 | 0.8207 | 0.8729 | 0.7120 |
Test set results, trained only on the training set (384 keywords, 1505-hour train set):
| scenario | Precision | Recall | F1 | ATWV |
|----------|-----------|----------|--------|--------|
| lgv | 0.6262 | 0.5648 | 0.5939 | 0.5825 |
| liv | 0.8797 | 0.6282 | 0.7330 | 0.6061 |
| stv | 0.9102 | 0.7221 | 0.8053 | 0.6682 |
Test set results, trained on the training set plus the finetune set (384 keywords, 1505-hour train set + 15-hour finetune set):
| scenario | Precision | Recall | F1 | ATWV |
|----------|-----------|----------|--------|--------|
| lgv | 0.6469 | 0.6276 | 0.6371 | 0.6116 |
| liv | 0.9278 | 0.7560 | 0.8331 | 0.6927 |
| stv | 0.9434 | 0.8061 | 0.8693 | 0.7275 |
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 8
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: false
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
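# With these weights the total training objective is, roughly,
#   loss = ctc_weight * loss_ctc + (1 - ctc_weight) * loss_att
# and, since the decoder is a bitransformer, the attention loss mixes the
# left-to-right and right-to-left branches as
#   loss_att = (1 - reverse_weight) * loss_l2r + reverse_weight * loss_r2l
# (a sketch of WeNet's hybrid CTC/attention loss, not an exact trace of the code)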
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
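# In the WeNet dataset pipeline, utterances are shuffled within a buffer of
# shuffle_size and then sorted by feature length within windows of sort_size,
# so each static batch groups utterances of similar duration; that is why
# sort_size should stay below shuffle_size.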
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 100
log_interval: 400
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 8
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: false
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
# use raw_wav or kaldi feature
raw_wav: false
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 100
log_interval: 400
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
#!/bin/bash
# Copyright 2021 Tencent Inc. (Author: Yougen Yuan).
# Apache 2.0
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU; otherwise it is multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
stage=-1
stop_stage=0
# The num of nodes
num_nodes=1
# The rank of current node
node_rank=0
# data
data=data
dict=data/dict/lang_char.txt
data_type=raw # raw or shard
train_set=train
dev_set=combine_dev
finetune2_set=combine_finetune_5h
# Optional train_config
name=vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char
train_config=conf/${finetune2_set}_${name}.yaml
cmvn=true
dir=exp/${finetune2_set}_${name}_new
checkpoint=$dir/0.pt
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=10
. tools/parse_options.sh || exit 1;
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# Data preparation
local/vkw_data_prep.sh
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
x=finetune_5h
for z in lgv liv stv; do
[ ! -f data/vkw/label/lab_${z}/${x}/wav_ori.scp ] && \
mv data/vkw/label/lab_${z}/${x}/wav.scp \
data/vkw/label/lab_${z}/${x}/wav_ori.scp && \
cut -d " " -f 1,4 data/vkw/label/lab_${z}/${x}/wav_ori.scp \
> data/vkw/label/lab_${z}/${x}/wav.scp
done
y=`echo $x | cut -d "_" -f 1`
mkdir -p combine_${y}
for f in text wav.scp segments; do
for z in lgv liv stv; do
cat data/vkw/label/lab_${z}/${x}/$f
done > combine_${y}/$f
done
# remove the spaces between text labels for the Mandarin dataset
# download and transfer to wav.scp
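# e.g. "UTT001 今 天 天 气" becomes "UTT001 今天天气" (keep the first field,
# strip spaces from the rest)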
cp data/${finetune2_set}/text data/${finetune2_set}/text.org
paste -d " " <(cut -f 1 -d" " data/${finetune2_set}/text.org) \
<(cut -f 2- -d" " data/${finetune2_set}/text.org | tr -d " ") \
> data/${finetune2_set}/text
rm data/${finetune2_set}/text.org
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: generate segmented wav.scp and compute cmvn"
## For wav feature, just copy the data. Fbank extraction is done in training
[ ! -f $data/$finetune2_set/segmented_wav.scp ] && \
python tools/segment.py --segments $data/$finetune2_set/segments \
--input $data/$finetune2_set/wav.scp \
--output $data/$finetune2_set/segmented_wav.scp
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Prepare data, prepare required format"
tools/make_raw_list.py --segments $data/$finetune2_set/segments \
$data/$finetune2_set/wav.scp $data/$finetune2_set/text $data/$finetune2_set/data.list
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
# You had better rm it manually before you start run.sh on first node.
# rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
# The number of gpus running on each node/machine
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp ${data}/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process used for knowing whether it is
# the master of a worker.
rank=$i ###`expr $node_rank \* $num_gpus + $i`
echo "start training"
[ ! -f exp/train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char_new/avg_5.pt ] && \
echo "Please use a pretrained model for finetuning" && exit 0
[ ! -f $checkpoint ] && \
cp exp/train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char_new/avg_5.pt $checkpoint && \
cp exp/train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char_new/0.yaml $dir/0.yaml
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
--train_data $data/${finetune2_set}/data.list \
--cv_data $data/${dev_set}/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 4 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
[ ! -f $decode_checkpoint ] && \
python3 wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Test model, please specify the model you want to use by --checkpoint
sets=${dev_set}
keywords_list=$data/vkw/keyword/kwlist
input_data=$data/${sets}/data.list
checkpoint=$dir/avg_${average_num}.pt
keyword_results=$dir/keyword_results_${sets}
ctc_results=$dir/ctc_results_${sets}
python3 local/vkw_kws_results.py --gpu 0 \
--config $dir/train.yaml \
--data_type $data_type \
--symbol_table $dict \
--num_workers 4 \
--prefetch 32 \
--input_data $input_data \
--checkpoint $checkpoint \
--keyword_unit_dict $keywords_list \
--keyword_results $keyword_results \
--ctc_results $ctc_results
[ ! -f scripts/bin/results_to_score.sh ] && \
ln -sf data/vkw/scripts scripts && chmod -R 755 scripts
### attention: install the F4DE tool before testing
for y in "stv" "lgv" "liv"; do
mkdir -p $dir/dev_${y}
#[ ! -f data/vkw/score/dev_${y}/utter_map ] && \
if [ $y == "lgv" ]; then
grep "TV1" $keyword_results > $dir/dev_${y}/kws_results
elif [ $y == "liv" ]; then
grep "sph_live" $keyword_results > $dir/dev_${y}/kws_results
elif [ $y == "stv" ]; then
grep "sph_video" $keyword_results > $dir/dev_${y}/kws_results
else
"invalid $y"
fi
./data/vkw/scripts/bin/results_to_score.sh \
data/vkw/score/dev_${y}/ecf \
data/vkw/label/lab_${y}/dev_5h/segments \
data/vkw/score/dev_${y}/utter_map \
$dir/dev_${y}/kws_results \
data/vkw/keyword/kwlist.xml \
data/vkw/score/dev_${y}/rttm
./data/vkw/scripts/bin/F1.sh \
$dir/dev_${y}/kws_outputs/f4de_scores_unnormalized/alignment.csv
done
fi
#!/bin/bash
# Copyright 2021 Tencent Inc. (Author: Yougen Yuan).
# Apache 2.0
current_dir=$(pwd)
stage=0
stop_stage=0
. ./path.sh || exit 1;
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
cd $current_dir/data/
[ ! -f vkw_v1.1.zip ] && echo "wget vkw challenge data to this directory" && exit 0
[ ! -d vkw ] && unzip vkw_v1.1.zip
cd $current_dir
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
x=train
[ ! -f data/${x}/text ] && echo "vkw trainset is missing, wget to this directory" && exit 0
fi
echo "$0: vkw data preparation succeeded"
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
# Tencent (Yougen Yuan)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import copy
import logging
import os
import torch
import torch.distributed as dist
import yaml
from torch.utils.data import DataLoader
from wenet.dataset.dataset import Dataset
from wenet.transformer.asr_model import init_asr_model
from wenet.utils.checkpoint import load_checkpoint
from wenet.utils.common import get_subsample
from wenet.utils.common import remove_duplicates_and_blank
from wenet.utils.file_utils import read_symbol_table
from wenet.utils.mask import make_pad_mask
def map_words2char(word_list_file):
word_unit_dict = {}
word_id_dict = {}
for line in open(word_list_file, mode="r", encoding="utf8"):
ids, keyword = line.split("\n")[0].split()
keyword_char = []
for i in keyword:
keyword_char.append(i)
word_unit_dict[keyword] = keyword_char
word_id_dict[keyword] = ids
return word_id_dict, word_unit_dict
def get_frames_timestamp(alignment):
# convert the frame-level alignment into per-token chunks (a Praat-style
# segmentation; Praat is a "doing phonetics by computer" tool) to help analyze the alignment
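# For example, an alignment like [0, 0, 5, 5, 0, 7] is split into
# [[0, 0, 5, 5], [0, 7]]: each chunk holds one non-blank token together with
# the blank (0) frames that precede it, and trailing blanks are appended to
# the last chunk.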
timestamp = []
# get frames level duration for each token
start = 0
end = 0
while end < len(alignment):
while end < len(alignment) and alignment[end] == 0:
end += 1
if end == len(alignment) and start < end:
if start == 0:
timestamp.append(alignment[start:])
else:
timestamp[-1] += alignment[start:]
break
end += 1
while end < len(alignment) and alignment[end - 1] == alignment[end]:
end += 1
timestamp.append(alignment[start:end])
start = end
return timestamp
def get_labformat_frames(timestamp, subsample, char_dict):
begin = 0
duration = 0
word_seq = []
word_time = []
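# word_time holds [begin, end] pairs in input feature frames (encoder frames
# multiplied by the subsampling factor); with the 10 ms frame_shift used in
# this recipe's configs, frame 100 corresponds to roughly 1 second.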
for idx, t in enumerate(timestamp):
duration = len(t) * subsample
if idx < len(timestamp) - 1:
word_seq.append(char_dict[t[-1]])
word_time.append([begin, begin + duration])
else:
non_blank = 0
token = 0
for i in t:
if i != 0:
token = i
break
word_seq.append(char_dict[token])
word_time.append([begin, begin + duration])
begin = begin + duration
return word_seq, word_time
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='compute keyword spotting results')
parser.add_argument('--config', required=True, help='config file')
parser.add_argument('--data_type',
default='raw',
choices=['raw', 'shard'],
help='train and cv data type')
parser.add_argument('--input_data', required=True, help='input data file')
parser.add_argument('--gpu',
type=int,
default=-1,
help='gpu id for this local rank, -1 for cpu')
parser.add_argument('--checkpoint', required=True, help='checkpoint model')
parser.add_argument('--ddp.rank',
dest='rank',
default=0,
type=int,
help='global rank for distributed training')
parser.add_argument('--ddp.world_size',
dest='world_size',
default=-1,
type=int,
help='''number of total processes/gpus for
distributed training''')
parser.add_argument('--ddp.dist_backend',
dest='dist_backend',
default='nccl',
choices=['nccl', 'gloo'],
help='distributed backend')
parser.add_argument('--ddp.init_method',
dest='init_method',
default=None,
help='ddp init method')
parser.add_argument('--num_workers',
default=0,
type=int,
help='num of subprocess workers for reading')
parser.add_argument('--pin_memory',
action='store_true',
default=False,
help='Use pinned memory buffers for reading')
parser.add_argument('--prefetch',
default=100,
type=int,
help='prefetch number')
parser.add_argument('--symbol_table',
required=True,
help='model unit symbol table for training')
parser.add_argument('--keyword_unit_dict',
required=True,
help='keyword id')
parser.add_argument('--keyword_results',
required=True,
help='keyword results')
parser.add_argument('--ctc_results', required=True, help='ctc results')
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
# Set random seed
torch.manual_seed(777)
with open(args.config, 'r') as fin:
configs = yaml.load(fin, Loader=yaml.FullLoader)
distributed = args.world_size > 1
if distributed:
logging.info('training on multiple gpus, this gpu {}'.format(args.gpu))
dist.init_process_group(args.dist_backend,
init_method=args.init_method,
world_size=args.world_size,
rank=args.rank)
symbol_table = read_symbol_table(args.symbol_table)
# Load dict
char_dict = {}
with open(args.symbol_table, mode='r') as fin:
for line in fin:
arr = line.strip().split()
assert len(arr) == 2
char_dict[int(arr[1])] = arr[0]
eos = len(char_dict) - 1
train_conf = configs['dataset_conf']
cv_conf = copy.deepcopy(train_conf)
cv_conf['speed_perturb'] = False
cv_conf['spec_aug'] = False
cv_dataset = Dataset(args.data_type,
args.input_data,
symbol_table,
cv_conf,
None,
partition=False)
cv_data_loader = DataLoader(cv_dataset,
batch_size=None,
pin_memory=args.pin_memory,
num_workers=args.num_workers,
prefetch_factor=args.prefetch)
print("Reading: ", args.keyword_unit_dict)
word_id_dict, word_unit_dict = map_words2char(args.keyword_unit_dict)
word_unit_list = list(word_unit_dict.keys())
print("word_unit_list has the size of %d" % (len(word_unit_list)))
# Init asr model from configs
model = init_asr_model(configs)
load_checkpoint(model, args.checkpoint)
use_cuda = args.gpu >= 0 and torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
model = model.to(device)
model.eval()
f_keyword_results = open(args.keyword_results, 'w', encoding='utf-8')
f_ctc_results = open(args.ctc_results, 'w', encoding='utf-8')
with torch.no_grad():
for batch_idx, batch in enumerate(cv_data_loader):
key, feat, target, feats_length, target_length = batch
feat = feat.to(device)
target = target.to(device)
feats_length = feats_length.to(device)
target_length = target_length.to(device)
# Let's assume B = batch_size and N = beam_size
# 1. Encoder
encoder_out, encoder_mask = model._forward_encoder(
feat, feats_length) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1)
batch_size = encoder_out.size(0)
ctc_probs = model.ctc.log_softmax(
encoder_out) # (B, maxlen, vocab_size)
encoder_out_lens = encoder_mask.squeeze(1).sum(1)
topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1)
topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen)
mask = make_pad_mask(encoder_out_lens) # (B, maxlen)
topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen)
alignment = [hyp.tolist() for hyp in topk_index]
hyps = [remove_duplicates_and_blank(hyp) for hyp in alignment]
for index, i in enumerate(key):
content = []
if len(hyps[index]) > 0:
for w in hyps[index]:
if w == eos:
break
content.append(char_dict[w])
f_ctc_results.write('{} {}\n'.format(i, " ".join(content)))
f_ctc_results.flush()
for index, i in enumerate(key):
timestamp = get_frames_timestamp(alignment[index])
subsample = get_subsample(configs)
word_seq, word_time = get_labformat_frames(
timestamp, subsample, char_dict)
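# Slide a window over the decoded character sequence: wherever a keyword's
# character sequence matches exactly, emit one line
# "<keyword_id> <utt_id> <start_frame> <end_frame> <score>" (score is fixed at 0.0).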
for index_j in range(len(word_seq)):
for keyword in word_unit_list:
keyword_len = len(word_unit_dict[keyword])
if index_j + keyword_len > len(word_seq):
continue
if (word_seq[index_j:index_j +
keyword_len] == word_unit_dict[keyword]):
f_keyword_results.write("{} {} {} {} {}\n".format(
word_id_dict[keyword], i,
word_time[index_j][0],
word_time[index_j + keyword_len - 1][1], 0.0))
f_keyword_results.flush()
f_keyword_results.close()
f_ctc_results.close()
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2021 Tencent Inc. (Author: Yougen Yuan).
# Apache 2.0
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU; otherwise it is multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
stage=-1
stop_stage=0
# The num of nodes
num_nodes=1
# The rank of current node
node_rank=0
# data
data=data
dict=data/dict/lang_char.txt
data_type=raw # raw or shard
train_set=train
dev_set=combine_dev
# Optional train_config
name=vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char
train_config=conf/train_${name}.yaml
cmvn=true
dir=exp/train_${name}_new
checkpoint= #$dir/0.pt
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=10
. tools/parse_options.sh || exit 1;
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# Data preparation
local/vkw_data_prep.sh
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
x=dev_5h
for z in lgv liv stv; do
[ ! -f data/vkw/label/lab_${z}/${x}/wav_ori.scp ] && \
mv data/vkw/label/lab_${z}/${x}/wav.scp \
data/vkw/label/lab_${z}/${x}/wav_ori.scp && \
cut -d " " -f 1,4 data/vkw/label/lab_${z}/${x}/wav_ori.scp \
> data/vkw/label/lab_${z}/${x}/wav.scp
done
y=`echo $x | cut -d "_" -f 1`
mkdir -p combine_${y}
for f in text wav.scp segments; do
for z in lgv liv stv; do
cat data/vkw/label/lab_${z}/${x}/$f
done > combine_${y}/$f
done
# remove the spaces between text labels for the Mandarin dataset
# download and transfer to wav.scp
for x in ${dev_set} ${train_set}; do
cp data/${x}/text data/${x}/text.org
paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " \
data/${x}/text.org | tr -d " ") > data/${x}/text
rm data/${x}/text.org
done
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: generate segmented wav.scp and compute cmvn"
## For wav feature, just copy the data. Fbank extraction is done in training
for x in ${dev_set} ${train_set}; do
[ ! -f $data/$x/segmented_wav.scp ] && \
python tools/segment.py --segments $data/$x/segments \
--input $data/$x/wav.scp \
--output $data/$x/segmented_wav.scp
done
### generate global_cmvn using training set
tools/compute_cmvn_stats.py --num_workers 12 --train_config $train_config \
--in_scp $data/${train_set}/segmented_wav.scp \
--out_cmvn $data/$train_set/global_cmvn
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Make train dict
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
tools/text2token.py -s 1 -n 1 $data/${train_set}/text | cut -f 2- -d" " | \
tr " " "\n" | sort | uniq | grep -a -v -e '^\s*$' | grep -P '[\p{Han}]'\
| awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Prepare data, prepare required format"
for x in ${dev_set} ${train_set}; do
tools/make_raw_list.py --segments $data/$x/segments \
$data/$x/wav.scp $data/$x/text $data/$x/data.list
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
# You had better rm it manually before you start run.sh on first node.
# rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
# The number of gpus running on each node/machine
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp ${data}/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process used for knowing whether it is
# the master of a worker.
rank=$i ###`expr $node_rank \* $num_gpus + $i`
echo "start training"
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
--train_data $data/$train_set/data.list \
--cv_data $data/${dev_set}/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 4 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
[ ! -f $decode_checkpoint ] && \
python3 wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Test model, please specify the model you want to use by --checkpoint
sets=${dev_set}
keywords_list=$data/vkw/keyword/kwlist
input_data=$data/${sets}/data.list
checkpoint=$dir/avg_${average_num}.pt
keyword_results=$dir/keyword_results_${sets}
ctc_results=$dir/ctc_results_${sets}
python3 local/vkw_kws_results.py --gpu 0 \
--config $dir/train.yaml \
--data_type $data_type \
--symbol_table $dict \
--num_workers 4 \
--prefetch 32 \
--input_data $input_data \
--checkpoint $checkpoint \
--keyword_unit_dict $keywords_list \
--keyword_results $keyword_results \
--ctc_results $ctc_results
[ ! -f scripts/bin/results_to_score.sh ] && \
ln -sf data/vkw/scripts scripts && chmod -R 755 scripts
### attention: install the F4DE tool before testing
for y in "stv" "lgv" "liv"; do
mkdir -p $dir/dev_${y}
#[ ! -f data/vkw/score/dev_${y}/utter_map ] && \
if [ $y == "lgv" ]; then
grep "TV1" $keyword_results > $dir/dev_${y}/kws_results
elif [ $y == "liv" ]; then
grep "sph_live" $keyword_results > $dir/dev_${y}/kws_results
elif [ $y == "stv" ]; then
grep "sph_video" $keyword_results > $dir/dev_${y}/kws_results
else
"invalid $y"
fi
./data/vkw/scripts/bin/results_to_score.sh \
data/vkw/score/dev_${y}/ecf \
data/vkw/label/lab_${y}/dev_5h/segments \
data/vkw/score/dev_${y}/utter_map \
$dir/dev_${y}/kws_results \
data/vkw/keyword/kwlist.xml \
data/vkw/score/dev_${y}/rttm
./data/vkw/scripts/bin/F1.sh \
$dir/dev_${y}/kws_outputs/f4de_scores_unnormalized/alignment.csv
done
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
echo "adding 5h finetune data for each scenario to obtain better results"
local/run_finetune_5h.sh
fi
# Performance Record
## Conformer
* Feature info: using fbank feature, with dither 1.0, with cmvn
* Training info: lr 0.001, batch size 32, 24 gpus on V100, acc_grad 16, 26 epochs
* Decoding info: ctc_weight 0.5, average_num 10
| decoding_method | Dev | Test\_Net | Test\_Meeting |
|:-------------------:|:----:|:---------:|:-------------:|
| ctc_greedy_search | 8.88 | 10.29 | 15.96 |
| attention | 9.38 | 10.12 | 17.28 |
| attention_rescoring | 8.69 | 9.70 | 15.59 |
## Conformer bidecoder
* Feature info: using fbank feature, with dither 1.0, with cmvn
* Training info: lr 0.001, batch size 32, 24 gpus on V100, acc_grad 16, 26 epochs
* Decoding info: ctc_weight 0.5, average_num 10
| decoding_method | Dev | Test\_Net | Test\_Meeting |
|:-------------------:|:----:|:---------:|:-------------:|
| ctc_greedy_search | 8.98 | 9.55 | 16.48 |
| attention | 9.42 | 10.57 | 18.05 |
| attention_rescoring | 8.85 | 9.25 | 16.18 |
## U2++ conformer
* Feature info: using fbank feature, with dither 1.0, with cmvn
* Training info: lr 0.001, batch size 48, 8 gpus on A100, acc_grad 16, 50 epochs
* Decoding info: ctc_weight 0.5, reverse_weight 0.3, average_num 10
| Decoding mode - Chunk size | Dev | Test\_Net | Test\_Meeting |
|:-----------------------------:|:----:|:---------:|:-------------:|
| ctc greedy search - full | 8.85 | 9.78 | 17.77 |
| ctc greedy search - 16 | 9.32 | 11.02 | 18.79 |
| ctc prefix beam search - full | 8.80 | 9.73 | 17.57 |
| ctc prefix beam search - 16 | 9.25 | 10.96 | 18.62 |
| attention rescoring - full | 8.60 | 9.26 | 17.34 |
| attention rescoring - 16 | 8.87 | 10.22 | 18.11 |
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
cnn_module_norm: 'layer_norm'
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
filter_conf:
max_length: 1200
min_length: 10
token_max_length: 100
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: false
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 30
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 1000 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 32
grad_clip: 5
accum_grad: 16
max_epoch: 26
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 5000
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
cnn_module_norm: 'layer_norm'
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
dataset_conf:
filter_conf:
max_length: 1200
min_length: 10
token_max_length: 100
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: false
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 30
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 1000 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 32
grad_clip: 5
accum_grad: 16
max_epoch: 26
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 5000
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
import argparse
import json
def get_args():
parser = argparse.ArgumentParser(description="""
This script is used to process the raw json dataset of WenetSpeech,
where each long wav is split into segments and
data in WeNet format is generated.
""")
parser.add_argument('input_json', help="""Input json file of WenetSpeech""")
parser.add_argument('output_dir', help="""Output dir for prepared data""")
args = parser.parse_args()
return args
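# meta_analysis() below walks WenetSpeech.json and writes Kaldi-style files into
# output_dir: wav.scp (aid<TAB>path), reco2dur (aid<TAB>duration), text (sid<TAB>text),
# segments (sid<TAB>aid<TAB>begin<TAB>end), utt2dur (sid<TAB>duration) and
# utt2subsets (sid<TAB>subset names).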
def meta_analysis(input_json, output_dir):
input_dir = os.path.dirname(input_json)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
try:
with open(input_json, 'r') as injson:
json_data = json.load(injson)
except Exception:
sys.exit(f'Failed to load input json file: {input_json}')
else:
if json_data['audios'] is not None:
with open(f'{output_dir}/text', 'w') as utt2text, \
open(f'{output_dir}/segments', 'w') as segments, \
open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
open(f'{output_dir}/wav.scp', 'w') as wavscp, \
open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
open(f'{output_dir}/reco2dur', 'w') as reco2dur:
for long_audio in json_data['audios']:
try:
long_audio_path = os.path.realpath(
os.path.join(input_dir, long_audio['path']))
aid = long_audio['aid']
segments_lists = long_audio['segments']
duration = long_audio['duration']
assert (os.path.exists(long_audio_path))
except AssertionError:
print(f'''Warning: {aid} something is wrong,
maybe AssertionError, skipped''')
continue
except Exception:
print(f'''Warning: {aid} something is wrong, maybe the
error path: {long_audio_path}, skipped''')
continue
else:
wavscp.write(f'{aid}\t{long_audio_path}\n')
reco2dur.write(f'{aid}\t{duration}\n')
for segment_file in segments_lists:
try:
sid = segment_file['sid']
start_time = segment_file['begin_time']
end_time = segment_file['end_time']
dur = end_time - start_time
text = segment_file['text']
segment_subsets = segment_file["subsets"]
except Exception:
print(f'''Warning: {segment_file} something
is wrong, skipped''')
continue
else:
utt2text.write(f'{sid}\t{text}\n')
segments.write(
f'{sid}\t{aid}\t{start_time}\t{end_time}\n'
)
utt2dur.write(f'{sid}\t{dur}\n')
segment_sub_names = " ".join(segment_subsets)
utt2subsets.write(
f'{sid}\t{segment_sub_names}\n')
def main():
args = get_args()
meta_analysis(args.input_json, args.output_dir)
if __name__ == '__main__':
main()
# Copyright 2021 NPU, ASLP Group (Author: Qijie Shao)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# process_opus.py: segmentation and downsampling of opus audio
# usage: python3 process_opus.py wav.scp segments output_wav.scp
from pydub import AudioSegment
import sys
import os
def read_file(wav_scp, segments):
wav_scp_dict = {}
with open(wav_scp, 'r', encoding='UTF-8') as fin:
for line_str in fin:
wav_id, path = line_str.strip().split()
wav_scp_dict[wav_id] = path
utt_list = []
seg_path_list = []
start_time_list = []
end_time_list = []
with open(segments, 'r', encoding='UTF-8') as fin:
for line_str in fin:
arr = line_str.strip().split()
assert len(arr) == 4
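# each segments line is: <utt_id> <wav_id> <start_seconds> <end_seconds>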
utt_list.append(arr[0])
seg_path_list.append(wav_scp_dict[arr[1]])
start_time_list.append(float(arr[2]))
end_time_list.append(float(arr[3]))
return utt_list, seg_path_list, start_time_list, end_time_list
# TODO(Qijie): Fix the process logic
def output(output_wav_scp, utt_list, seg_path_list, start_time_list,
end_time_list):
num_utts = len(utt_list)
step = int(num_utts * 0.01)
with open(output_wav_scp, 'w', encoding='UTF-8') as fout:
previous_wav_path = ""
for i in range(num_utts):
utt_id = utt_list[i]
current_wav_path = seg_path_list[i]
output_dir = (os.path.dirname(current_wav_path)) \
.replace("audio", 'audio_seg')
seg_wav_path = os.path.join(output_dir, utt_id + '.wav')
# if not os.path.exists(output_dir):
# os.makedirs(output_dir)
if current_wav_path != previous_wav_path:
source_wav = AudioSegment.from_file(current_wav_path)
previous_wav_path = current_wav_path
start = int(start_time_list[i] * 1000)
end = int(end_time_list[i] * 1000)
target_audio = source_wav[start:end].set_frame_rate(16000) \
.set_sample_width(2)
target_audio.export(seg_wav_path, format="wav")
fout.write("{} {}\n".format(utt_id, seg_wav_path))
if step > 0 and i % step == 0:
print("seg wav finished: {}%".format(int(i / step)))
def main():
wav_scp = sys.argv[1]
segments = sys.argv[2]
output_wav_scp = sys.argv[3]
utt_list, seg_path_list, start_time_list, end_time_list \
= read_file(wav_scp, segments)
output(output_wav_scp, utt_list, seg_path_list, start_time_list,
end_time_list)
if __name__ == '__main__':
main()
#!/usr/bin/env bash
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
# Seasalt AI, Inc (Author: Guoguo Chen)
# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# NPU, ASLP Group (Author: Qijie Shao)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
set -o pipefail
stage=1
prefix=
train_subset=L
. ./tools/parse_options.sh || exit 1;
filter_by_id () {
idlist=$1
input=$2
output=$3
field=1
if [ $# -eq 4 ]; then
field=$4
fi
cat $input | perl -se '
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
@A = split;
@A>=1 || die "Invalid id-list file line $_";
$seen{$A[0]} = 1;
}
while(<>) {
@A = split;
@A > 0 || die "Invalid file line $_";
@A >= $field || die "Invalid file line $_";
if ($seen{$A[$field-1]}) {
print $_;
}
}' -- -idlist="$idlist" -field="$field" > $output ||\
(echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
}
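# filter_by_id <idlist> <input> <output> [field]
# keeps the lines of <input> whose id in column [field] (default: column 1)
# appears in the first column of <idlist>; used below to subset text, segments,
# utt2dur and wav.scp.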
subset_data_dir () {
utt_list=$1
src_dir=$2
dest_dir=$3
mkdir -p $dest_dir || exit 1;
# wav.scp text segments utt2dur
filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\
(echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1;
filter_by_id $utt_list $src_dir/text $dest_dir/text ||\
(echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1;
filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\
(echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1;
awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco
filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\
(echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1;
rm -f $dest_dir/reco
}
if [ $# -ne 2 ]; then
echo "Usage: $0 [options] <wenetspeech-dataset-dir> <data-dir>"
echo " e.g.: $0 --train-subset L /disk1/audio_data/wenetspeech/ data/"
echo ""
echo "This script takes the WenetSpeech source directory, and prepares the"
echo "WeNet format data directory."
echo " --prefix <prefix> # Prefix for output data directory."
echo " --stage <stage> # Processing stage."
echo " --train-subset <L|M|S|W> # Train subset to be created."
exit 1
fi
wenetspeech_dir=$1
data_dir=$2
declare -A subsets
subsets=(
[L]="train_l"
[M]="train_m"
[S]="train_s"
[W]="train_w"
[DEV]="dev"
[TEST_NET]="test_net"
[TEST_MEETING]="test_meeting")
prefix=${prefix:+${prefix}_}
corpus_dir=$data_dir/${prefix}corpus/
if [ $stage -le 1 ]; then
echo "$0: Extract meta into $corpus_dir"
# Sanity check.
[ ! -f $wenetspeech_dir/WenetSpeech.json ] &&\
echo "$0: Please download $wenetspeech_dir/WenetSpeech.json!" && exit 1;
[ ! -d $wenetspeech_dir/audio ] &&\
echo "$0: Please download $wenetspeech_dir/audio!" && exit 1;
[ ! -d $corpus_dir ] && mkdir -p $corpus_dir
# Files to be created:
# wav.scp text segments utt2dur
python3 local/extract_meta.py \
$wenetspeech_dir/WenetSpeech.json $corpus_dir || exit 1;
fi
if [ $stage -le 2 ]; then
echo "$0: Split data to train, dev, test_net, and test_meeting"
[ ! -f $corpus_dir/utt2subsets ] &&\
echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;
for label in $train_subset DEV TEST_NET TEST_MEETING; do
if [ ! ${subsets[$label]+set} ]; then
echo "$0: Subset $label is not defined in WenetSpeech.json." && exit 1;
fi
subset=${subsets[$label]}
[ ! -d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset
cat $corpus_dir/utt2subsets | \
awk -v s=$label '{for (i=2;i<=NF;i++) if($i==s) print $0;}' \
> $corpus_dir/${prefix}${subset}_utt_list|| exit 1;
subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \
$corpus_dir $data_dir/${prefix}$subset || exit 1;
done
fi
echo "$0: Done"
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# NPU, ASLP Group (Author: Qijie Shao)
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU; otherwise it is multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
stage=0
stop_stage=5
# The num of nodes
num_nodes=1
# The rank of current node
node_rank=0
# Use your own data path. You need to download the WenetSpeech dataset by yourself.
wenetspeech_data_dir=/ssd/nfs07/binbinzhang/wenetspeech
# Make sure you have 1.2T for ${shards_dir}
shards_dir=/ssd/nfs06/unified_data/wenetspeech_shards
# WenetSpeech training set
set=L
train_set=train_`echo $set | tr 'A-Z' 'a-z'`
dev_set=dev
test_sets="test_net test_meeting"
train_config=conf/train_conformer.yaml
checkpoint=
cmvn=true
cmvn_sampling_divisor=20 # 20 means 5% of the training data to estimate cmvn
dir=exp/conformer
decode_checkpoint=
average_checkpoint=true
average_num=10
decode_modes="attention_rescoring ctc_greedy_search"
. tools/parse_options.sh || exit 1;
set -u
set -o pipefail
# Data download
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data."
exit 0;
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Data preparation"
local/wenetspeech_data_prep.sh \
--train-subset $set \
$wenetspeech_data_dir \
data || exit 1;
fi
dict=data/dict/lang_char.txt
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Make a dictionary"
echo "dictionary: ${dict}"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
echo "▁ 2" >> ${dict} # ▁ is for space
tools/text2token.py -s 1 -n 1 --space "▁" data/${train_set}/text \
| cut -f 2- -d" " | tr " " "\n" \
| sort | uniq | grep -a -v -e '^\s*$' \
| grep -v "▁" \
| awk '{print $0 " " NR+2}' >> ${dict} \
|| exit 1;
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Compute cmvn"
# Here we use all the training data, you can sample some data to save time
# BUG!!! We should use the segmented data for CMVN
if $cmvn; then
full_size=`cat data/${train_set}/wav.scp | wc -l`
sampling_size=$((full_size / cmvn_sampling_divisor))
shuf -n $sampling_size data/$train_set/wav.scp \
> data/$train_set/wav.scp.sampled
python3 tools/compute_cmvn_stats.py \
--num_workers 16 \
--train_config $train_config \
--in_scp data/$train_set/wav.scp.sampled \
--out_cmvn data/$train_set/global_cmvn \
|| exit 1;
fi
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Making shards, please wait..."
RED='\033[0;31m'
NOCOLOR='\033[0m'
echo -e "It requires ${RED}1.2T ${NOCOLOR}space for $shards_dir, please make sure you have enough space"
echo -e "It takes about ${RED}12 ${NOCOLOR}hours with 32 threads"
for x in $dev_set $test_sets ${train_set}; do
dst=$shards_dir/$x
mkdir -p $dst
tools/make_shard_list.py --resample 16000 --num_utts_per_shard 1000 \
--num_threads 32 --segments data/$x/segments \
data/$x/wav.scp data/$x/text \
$(realpath $dst) data/$x/data.list
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "Start training"
mkdir -p $dir
# INIT_FILE is for DDP synchronization
INIT_FILE=$dir/ddp_init
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="nccl"
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp data/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process used for knowing whether it is
# the master of a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type "shard" \
--symbol_table $dict \
--train_data data/$train_set/data.list \
--cv_data data/$dev_set/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
$cmvn_opts \
--num_workers 8 \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
echo "Test model"
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
ctc_weight=0.5
reverse_weight=0.0
for testset in ${test_sets} ${dev_set}; do
{
for mode in ${decode_modes}; do
{
base=$(basename $decode_checkpoint)
result_dir=$dir/${testset}_${mode}_${base}
mkdir -p $result_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type "shard" \
--test_data data/$testset/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--reverse_weight $reverse_weight \
--result_file $result_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
python tools/compute-wer.py --char=1 --v=1 \
data/$testset/text $result_dir/text > $result_dir/wer
}
done
wait
}
done
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
echo "Export the best model you want"
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip
fi