Commit 764b3a75 authored by Sugon_ldc's avatar Sugon_ldc
Browse files

add new model

parents
#!/usr/bin/env bash
#
# Copyright 2014 Nickolay V. Shmyrev
# 2014 Brno University of Technology (Author: Karel Vesely)
# 2016 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# To be run from one directory above this script.
#
# Builds Kaldi-style data directories (stm, text, segments, utt2spk, spk2utt,
# wav.scp, reco2file_and_channel, glm) for the TEDLIUM release-3 dev/test/train
# sets found under db/TEDLIUM_release-3/<data_type>/.
# Arguments: $1 - corpus layout directory name (e.g. "legacy").
. ./path.sh
export LC_ALL=C
# NOTE(review): sph2pipe is assigned here but never referenced below —
# confirm whether this assignment is still needed.
sph2pipe=sph2pipe
data_type=$1
# Prepare: test, train,
for set in dev test train; do
  dir=data/$set.orig
  mkdir -p $dir
  # Merge transcripts into a single 'stm' file, do some mappings:
  # - <F0_M> -> <o,f0,male> : map dev stm labels to be coherent with train + test,
  # - <F0_F> -> <o,f0,female> : --||--
  # - (2) -> null : remove pronunciation variants in transcripts, keep in dictionary
  # - <sil> -> null : remove marked <sil>, it is modelled implicitly (in kaldi)
  # - (...) -> null : remove utterance names from end-lines of train
  # - it 's -> it's : merge words that contain apostrophe (if compound in dictionary, local/join_suffix.py)
  { # Add STM header, so sclite can prepare the '.lur' file
    echo ';;
;; LABEL "o" "Overall" "Overall results"
;; LABEL "f0" "f0" "Wideband channel"
;; LABEL "f2" "f2" "Telephone channel"
;; LABEL "male" "Male" "Male Talkers"
;; LABEL "female" "Female" "Female Talkers"
;;'
    # Process the STMs: sort by recording/channel/start-time, apply the
    # mappings listed above, then force the channel field ($2) to "A".
    cat db/TEDLIUM_release-3/${data_type}/$set/stm/*.stm | sort -k1,1 -k2,2 -k4,4n | \
      sed -e 's:<F0_M>:<o,f0,male>:' \
        -e 's:<F0_F>:<o,f0,female>:' \
        -e 's:([0-9])::g' \
        -e 's:<sil>::g' \
        -e 's:([^ ]*)$::' | \
      awk '{ $2 = "A"; print $0; }'
  } | local/join_suffix.py > data/$set.orig/stm
  # Prepare 'text' file
  # - {NOISE} -> [NOISE] : map the tags to match symbols in dictionary
  # Utterance ids become "<rec>-<start>-<end>" with times in 10ms frames.
  cat $dir/stm | grep -v -e 'ignore_time_segment_in_scoring' -e ';;' | \
    awk '{ printf ("%s-%07d-%07d", $1, $4*100, $5*100);
for (i=7;i<=NF;i++) { printf(" %s", $i); }
printf("\n");
}' | tr '{}' '[]' | sort -k1,1 > $dir/text || exit 1
  # Prepare 'segments', 'utt2spk', 'spk2utt'
  cat $dir/text | cut -d" " -f 1 | awk -F"-" '{printf("%s %s %07.2f %07.2f\n", $0, $1, $2/100.0, $3/100.0)}' > $dir/segments
  cat $dir/segments | awk '{print $1, $2}' > $dir/utt2spk
  cat $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt
  # Prepare 'wav.scp', 'reco2file_and_channel'
  cat $dir/spk2utt | awk -v data_type=$data_type -v set=$set -v pwd=$PWD '{ printf("%s %s/db/TEDLIUM_release-3/%s/%s/sph/%s.sph\n", $1, pwd, data_type, set, $1); }' > $dir/wav.scp
  cat $dir/wav.scp | awk '{ print $1, $1, "A"; }' > $dir/reco2file_and_channel
  # Create empty 'glm' file
  echo ';; empty.glm
[FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token
' > data/$set.orig/glm
  # The training set seems to not have enough silence padding in the segmentations,
  # especially at the beginning of segments. Extend the times.
  if [ $set == "train" ]; then
    mv data/$set.orig/segments data/$set.orig/segments.temp
    utils/data/extend_segment_times.py --start-padding=0.15 \
      --end-padding=0.1 <data/$set.orig/segments.temp >data/$set.orig/segments || exit 1
    rm data/$set.orig/segments.temp
  fi
  # Check that data dirs are okay!
  utils/validate_data_dir.sh --no-feats $dir || exit 1
done
# Environment setup: put the wenet runtime build binaries, its bundled kaldi
# tools, and the locally-built openfst on PATH.
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Top-level recipe: data prep -> dict/BPE -> raw lists -> DDP training ->
# decoding -> model export.  Stages are selected via --stage/--stop_stage
# (parsed by tools/parse_options.sh below).
. ./path.sh || exit 1;
# Use this to control how many gpus you use. It's 1-gpu training if you
# specify just 1 gpu, otherwise it is multi-gpu training based on DDP in
# PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
# The num of nodes or machines used for multi-machine training
# Default 1 for single machine/node
# NFS will be needed if you want run multi-machine training
num_nodes=1
# The rank of each node or machine, range from 0 to num_nodes -1
# The first node/machine sets node_rank 0, the second one sets node_rank 1
# the third one set node_rank 2, and so on. Default 0
node_rank=0
nj=16 # number of parallel jobs (not used further below in this script)
feat_dir=raw_wav # staging dir for per-set wav/text/data.list files
data_type=raw # raw or shard
num_utts_per_shard=1000
data_cat=legacy # corpus layout passed to local/prepare_data.sh
train_set=train
train_config=conf/train_conformer.yaml
cmvn=true # whether to compute/apply global CMVN
dir=exp/conformer # experiment/output directory
checkpoint= # optional checkpoint to resume training from
# bpemode (unigram or bpe)
nbpe=500
bpemode=unigram
# use average_checkpoint will get better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=10
decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring"
. tools/parse_options.sh || exit 1;
# Stage -1: optional corpus download.
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
  echo "stage -1: Data Download"
  local/download_data.sh # make soft link by yourself if you already have the dataset
fi
# Stage 0: Kaldi-style data dirs, then cap each pseudo-speaker at 180s.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  # Data preparation
  local/prepare_data.sh $data_cat
  for dset in dev test train; do
    utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 \
      data/${dset}.orig data/${dset}
  done
fi
# Stage 1: stage data under $feat_dir and compute global CMVN stats.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  # For wav feature, just copy the data. Fbank extraction is done in training
  mkdir -p $feat_dir
  for x in ${train_set} dev test; do
    cp -r data/$x $feat_dir
  done
  tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
    --in_scp data/${train_set}/wav.scp \
    --out_cmvn $feat_dir/$train_set/global_cmvn
fi
dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
# Stage 2: train a sentencepiece model and build the unit dictionary.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  ### Task dependent. You have to check non-linguistic symbols used in the corpus.
  echo "stage 2: Dictionary and Json Data Preparation"
  mkdir -p data/lang_char/
  echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
  echo "<unk> 1" >> ${dict} # <unk> must be 1
  # we borrowed these code and scripts which are related bpe from ESPnet.
  cut -f 2- -d" " data/${train_set}/text > data/lang_char/input.txt
  tools/spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} \
    --model_type=${bpemode} --model_prefix=${bpemodel} \
    --input_sentence_size=100000000
  # BPE pieces get ids 2..N+1, after <blank>=0 and <unk>=1.
  tools/spm_encode --model=${bpemodel}.model \
    --output_format=piece < data/lang_char/input.txt | \
    tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
  num_token=$(cat $dict | wc -l)
  echo "<sos/eos> $num_token" >> $dict # <eos>
  wc -l ${dict}
fi
# Stage 3: build wenet "raw" data lists (requires a segments file).
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  echo "Prepare data, prepare required format"
  if [ ! -f $feat_dir/$train_set/segments ]; then
    echo "$0: No such file segments" && exit 1;
  else
    for x in dev test ${train_set}; do
      tools/make_raw_list.py --segments $feat_dir/$x/segments \
        $feat_dir/$x/wav.scp $feat_dir/$x/text $feat_dir/$x/data.list
    done
  fi
fi
# Stage 4: DDP training — one background process per visible GPU.
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  # Training
  mkdir -p $dir
  INIT_FILE=$dir/ddp_init
  # You had better rm it manually before you start run.sh on first node.
  # rm -f $INIT_FILE # delete old one before starting
  init_method=file://$(readlink -f $INIT_FILE)
  echo "$0: init method is $init_method"
  # The number of gpus running on each node/machine.
  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
  # Use "nccl" if it works, otherwise use "gloo"
  dist_backend="nccl"
  # The total number of processes/gpus, so that the master knows
  # how many workers to wait for.
  # More details about ddp can be found in
  # https://pytorch.org/tutorials/intermediate/dist_tuto.html
  # BUGFIX/modernization: use POSIX $(( )) arithmetic instead of the
  # deprecated `expr` and $[ ] forms.
  world_size=$((num_gpus * num_nodes))
  echo "total gpus is: $world_size"
  cmvn_opts=
  $cmvn && cp ${feat_dir}/${train_set}/global_cmvn $dir
  $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
  # train.py will write $train_config to $dir/train.yaml with model input
  # and output dimension, train.yaml will be used for inference or model
  # export later
  for ((i = 0; i < num_gpus; ++i)); do
    {
      # Pick the i-th entry of CUDA_VISIBLE_DEVICES (cut fields are 1-based).
      gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
      # Rank of each gpu/process used for knowing whether it is
      # the master of a worker.
      rank=$((node_rank * num_gpus + i))
      python wenet/bin/train.py --gpu $gpu_id \
        --config $train_config \
        --data_type $data_type \
        --symbol_table $dict \
        --bpe_model $bpemodel.model \
        --train_data $feat_dir/$train_set/data.list \
        --cv_data $feat_dir/dev/data.list \
        ${checkpoint:+--checkpoint $checkpoint} \
        --model_dir $dir \
        --ddp.init_method $init_method \
        --ddp.world_size $world_size \
        --ddp.rank $rank \
        --ddp.dist_backend $dist_backend \
        --num_workers 8 \
        $cmvn_opts \
        --pin_memory
    } &
  done
  wait
fi
# Stage 5: optionally average checkpoints, then decode the test set with
# every mode in $decode_modes (one background job per mode) and score WER.
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  # Test model, please specify the model you want to test by --checkpoint
  if [ ${average_checkpoint} == true ]; then
    decode_checkpoint=$dir/avg_${average_num}.pt
    echo "do model average and final checkpoint is $decode_checkpoint"
    python wenet/bin/average_model.py \
      --dst_model $decode_checkpoint \
      --src_path $dir \
      --num ${average_num} \
      --val_best
  fi
  # Specify decoding_chunk_size if it's a unified dynamic chunk trained model
  # -1 for full chunk
  decoding_chunk_size=
  ctc_weight=0.5
  reverse_weight=0.0
  for mode in ${decode_modes}; do
    {
      test_dir=$dir/test_${mode}
      mkdir -p $test_dir
      python wenet/bin/recognize.py --gpu 0 \
        --mode $mode \
        --config $dir/train.yaml \
        --data_type $data_type \
        --test_data $feat_dir/test/data.list \
        --checkpoint $decode_checkpoint \
        --beam_size 10 \
        --batch_size 1 \
        --penalty 0.0 \
        --dict $dict \
        --bpe_model $bpemodel.model \
        --ctc_weight $ctc_weight \
        --reverse_weight $reverse_weight \
        --result_file $test_dir/text \
        ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
      python tools/compute-wer.py --char=1 --v=1 \
        $feat_dir/test/text $test_dir/text > $test_dir/wer
    } &
  done
  wait
fi
# Stage 6: export the (averaged) model as TorchScript for the runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
  # Export the best model you want
  python wenet/bin/export_jit.py \
    --config $dir/train.yaml \
    --checkpoint $dir/avg_${average_num}.pt \
    --output_file $dir/final.zip
fi
../../../tools/
\ No newline at end of file
../../../wenet/
\ No newline at end of file
# Performance Record
## Conformer Result
* Feature info: dither + specaug + speed perturb
* Training info: lr 0.002, warmup_steps 5000, batch size 16, 1 gpu, acc_grad 4, 120 epochs
* Decoding info: average_num 20
* trans_type: phn
| decoding mode | test (wer) |
| :--------------------: | :---------: |
| ctc_greedy_search | 16.70% |
| ctc_prefix_beam_search | 16.60% |
| attention | 22.37% |
| attention_rescoring | 16.60% |
## transformer Result
* Feature info: dither + specaug + speed perturb
* Training info: lr 0.002, warmup_steps 5000, batch size 16, 1 gpu, acc_grad 4, 120 epochs
* Decoding info: average_num 20
* trans_type: phn
| decoding mode | test (wer) |
| :--------------------: | :---------: |
| ctc_greedy_search | 17.78% |
| ctc_prefix_beam_search | 17.46% |
| attention | 21.77% |
| attention_rescoring | 17.06% |
\ No newline at end of file
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
split_with_space: true
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 4 # gradient accumulation steps
max_epoch: 120
log_interval: 10
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 5000 # 20000
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.2
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder architecture type
normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.2
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
split_with_space: true
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 4
max_epoch: 120
log_interval: 10
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 5000
faks0
fdac1
fjem0
mgwt0
mjar0
mmdb1
mmdm2
mpdf0
fcmh0
fkms0
mbdg0
mbwm0
mcsh0
fadg0
fdms0
fedw0
mgjf0
mglb0
mrtk0
mtaa0
mtdt0
mthc0
mwjg0
fnmr0
frew0
fsem0
mbns0
mmjr0
mdls0
mdlf0
mdvc0
mers0
fmah0
fdrw0
mrcs0
mrjm4
fcal1
mmwh0
fjsj0
majc0
mjsw0
mreb0
fgjd0
fjmg0
mroa0
mteb0
mjfc0
mrjr0
fmml0
mrws1
aa aa aa
ae ae ae
ah ah ah
ao ao aa
aw aw aw
ax ax ah
ax-h ax ah
axr er er
ay ay ay
b b b
bcl vcl sil
ch ch ch
d d d
dcl vcl sil
dh dh dh
dx dx dx
eh eh eh
el el l
em m m
en en n
eng ng ng
epi epi sil
er er er
ey ey ey
f f f
g g g
gcl vcl sil
h# sil sil
hh hh hh
hv hh hh
ih ih ih
ix ix ih
iy iy iy
jh jh jh
k k k
kcl cl sil
l l l
m m m
n n n
ng ng ng
nx n n
ow ow ow
oy oy oy
p p p
pau sil sil
pcl cl sil
q
r r r
s s s
sh sh sh
t t t
tcl cl sil
th th th
uh uh uh
uw uw uw
ux uw uw
v v v
w w w
y y y
z z z
zh zh sh
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import os
def sph2pipe_wav(in_wav, tmp_out_wav, out_wav):
    """Rewrite a Kaldi wav.scp whose entries are sph2pipe pipe commands.

    Each input line is expected to look like (fields space-separated):
        <utt-id> <sph2pipe> -f wav <path/to/file.sph> [|]
    For every line two outputs are produced:
      * ``tmp_out_wav``: fields 2-5 of the input plus the destination path,
        i.e. a runnable "<sph2pipe> -f wav <in.sph> <out>" command;
      * ``out_wav``: "<utt-id> <out>" — the rewritten wav.scp entry.
    The destination path mirrors the source path with the 4th-from-last
    directory component suffixed with '_pipe'; missing directories are
    created on demand.
    """
    with open(in_wav, 'r', encoding='utf-8') as in_f, \
            open(tmp_out_wav, 'w', encoding='utf-8') as tmp_out_f, \
            open(out_wav, 'w', encoding='utf-8') as out_f:
        for line in in_f:
            # Robustness: skip blank lines (previously an IndexError) and
            # tolerate runs of whitespace via split() with no argument.
            parts = line.strip().split()
            if not parts:
                continue
            src_path = parts[4]  # sph file path is the 5th field
            comps = src_path.split('/')
            comps[-4] = comps[-4] + '_pipe'
            # exist_ok avoids the racy os.path.exists/makedirs pair.
            os.makedirs('/'.join(comps[:-1]), exist_ok=True)
            dst_path = '/'.join(comps)
            tmp_out_f.write(' '.join(parts[1:5]) + ' ' + dst_path + '\n')
            out_f.write(parts[0] + ' ' + dst_path + '\n')
if __name__ == '__main__':
    # Usage: sph2pipe_process.py <in_wav_scp> <tmp_out_wav_scp> <out_wav_scp>
    if len(sys.argv) != 4:
        print('wrong input parameter')
        raise NotImplementedError(len(sys.argv))
    in_wav = sys.argv[1]       # original wav.scp containing sph2pipe commands
    tmp_out_wav = sys.argv[2]  # output: one conversion command per line
    out_wav = sys.argv[3]      # output: rewritten wav.scp (utt-id -> wav path)
    sph2pipe_wav(in_wav, tmp_out_wav, out_wav)
mdab0
mwbt0
felc0
mtas1
mwew0
fpas0
mjmp0
mlnt0
fpkt0
mlll0
mtls0
fjlm0
mbpm0
mklt0
fnlp0
mcmj0
mjdh0
fmgd0
mgrt0
mnjm0
fdhc0
mjln0
mpam0
fmld0
#!/usr/bin/env bash
# Copyright 2013 (Authors: Bagher BabaAli, Daniel Povey, Arnab Ghoshal)
# 2014 Brno University of Technology (Author: Karel Vesely)
# 2019 IIIT-Bangalore (Shreekantha Nadig)
# Apache 2.0.
#
# TIMIT data preparation.
# Usage: $0 <TIMIT-corpus-dir> [phn|char]
# Writes per-set scp/transcript/speaker files into data/local/data.
create_glm_stm=false # set true to also produce sclite STM/GLM files
if [ $# -le 0 ]; then
  echo "Argument should be the Timit directory, see ../run.sh for example."
  exit 1;
fi
dir=`pwd`/data/local/data
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
conf=`pwd`/conf
# Optional second argument selects the transcript type: "phn" (default)
# or "char".  Any other value is a usage error.
if [ "$2" ]; then
  if [[ $2 = "char" || $2 = "phn" ]]; then
    trans_type=$2
  else
    echo "Transcript type must be one of [phn, char]" >&2
    echo "$2" >&2
    # BUGFIX: previously this fell through with trans_type unset and the
    # script only failed (with exit 0!) much later inside the main loop.
    exit 1
  fi
else
  trans_type=phn
fi
. ./path.sh
# Fetch and build sph2pipe locally when it is not already present under
# tools/ (primary mirror openslr, fallback sourceforge).
sph2pipe_version="v2.5"
if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then
  echo "Download sph2pipe_${sph2pipe_version} ......"
  wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \
  wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \
  tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools
  cd tools/sph2pipe_${sph2pipe_version}/ && \
  gcc -o sph2pipe *.c -lm
  cd -
fi
# Prefer an installed sph2pipe; fall back to the locally built binary.
sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe
if ! command -v "${sph2pipe}" &> /dev/null; then
  echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
  exit 1;
fi
# NOTE(review): error_exit is not defined anywhere in this script — if a
# list is missing, bash reports "command not found" and keeps going.
# Confirm where error_exit is supposed to be sourced from.
[ -f $local/test_spk.list ] || error_exit "$PROG: Eval-set speaker list not found.";
[ -f $local/dev_spk.list ] || error_exit "$PROG: dev-set speaker list not found.";
# First check if the train & test directories exist (these can either be upper-
# or lower-cased
if [ ! -d $1/TRAIN -o ! -d $1/TEST ] && [ ! -d $1/train -o ! -d $1/test ]; then
  echo "timit_data_prep.sh: Spot check of command line argument failed"
  echo "Command line argument must be absolute pathname to TIMIT directory"
  echo "with name like /export/corpora5/LDC/LDC93S1/timit/TIMIT"
  exit 1;
fi
# Now check what case the directory structure is
uppercased=false
train_dir=train
test_dir=test
if [ -d $1/TRAIN ]; then
  uppercased=true
  train_dir=TRAIN
  test_dir=TEST
fi
# Scratch dir, removed automatically on exit.
tmpdir=$(mktemp -d /tmp/kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT
# Get the list of speakers. The list of speakers in the 24-speaker core test
# set and the 50-speaker development set must be supplied to the script. All
# speakers in the 'train' directory are used for training.
# Speaker lists are case-folded to match the corpus directory case.
if $uppercased; then
  tr '[:lower:]' '[:upper:]' < $local/dev_spk.list > $tmpdir/dev_spk
  tr '[:lower:]' '[:upper:]' < $local/test_spk.list > $tmpdir/test_spk
  ls -d "$1"/TRAIN/DR*/* | sed -e "s:^.*/::" > $tmpdir/train_spk
else
  tr '[:upper:]' '[:lower:]' < $local/dev_spk.list > $tmpdir/dev_spk
  tr '[:upper:]' '[:lower:]' < $local/test_spk.list > $tmpdir/test_spk
  ls -d "$1"/train/dr*/* | sed -e "s:^.*/::" > $tmpdir/train_spk
fi
cd $dir
# Main loop: for each set build file lists, transcripts, wav.scp, speaker
# maps and (optionally) sclite STM/GLM files.
for x in train dev test; do
  # First, find the list of audio files (use only si & sx utterances).
  # Note: train & test sets are under different directories, but doing find on
  # both and grepping for the speakers will work correctly.
  find $1/{$train_dir,$test_dir} -not \( -iname 'SA*' \) -iname '*.WAV' \
    | grep -f $tmpdir/${x}_spk > ${x}_sph.flist
  sed -e 's:.*/\(.*\)/\(.*\).WAV$:\1_\2:i' ${x}_sph.flist \
    > $tmpdir/${x}_sph.uttids
  paste $tmpdir/${x}_sph.uttids ${x}_sph.flist \
    | sort -k1,1 > ${x}_sph.scp
  cat ${x}_sph.scp | awk '{print $1}' > ${x}.uttids
  # Now, Convert the transcripts into our format (no normalization yet)
  # Get the transcripts: each line of the output contains an utterance
  # ID followed by the transcript.
  if [ $trans_type = "phn" ]
  then
    echo "phone transcript!"
    find $1/{$train_dir,$test_dir} -not \( -iname 'SA*' \) -iname '*.PHN' \
      | grep -f $tmpdir/${x}_spk > $tmpdir/${x}_phn.flist
    sed -e 's:.*/\(.*\)/\(.*\).PHN$:\1_\2:i' $tmpdir/${x}_phn.flist \
      > $tmpdir/${x}_phn.uttids
    # NOTE(review): error_exit is not defined in this script; a missing
    # transcription file would print "command not found" and continue.
    while read line; do
      [ -f $line ] || error_exit "Cannot find transcription file '$line'";
      cut -f3 -d' ' "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;'
    done < $tmpdir/${x}_phn.flist > $tmpdir/${x}_phn.trans
    paste $tmpdir/${x}_phn.uttids $tmpdir/${x}_phn.trans \
      | sort -k1,1 > ${x}.trans
  elif [ $trans_type = "char" ]
  then
    echo "char transcript!"
    find $1/{$train_dir,$test_dir} -not \( -iname 'SA*' \) -iname '*.WRD' \
      | grep -f $tmpdir/${x}_spk > $tmpdir/${x}_wrd.flist
    sed -e 's:.*/\(.*\)/\(.*\).WRD$:\1_\2:i' $tmpdir/${x}_wrd.flist \
      > $tmpdir/${x}_wrd.uttids
    while read line; do
      [ -f $line ] || error_exit "Cannot find transcription file '$line'";
      cut -f3 -d' ' "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;' | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z A-Z]//g'
    done < $tmpdir/${x}_wrd.flist > $tmpdir/${x}_wrd.trans
    paste $tmpdir/${x}_wrd.uttids $tmpdir/${x}_wrd.trans \
      | sort -k1,1 > ${x}.trans
  else
    echo "WRONG!"
    echo $trans_type
    # BUGFIX: was 'exit 0', which reported success on an invalid
    # transcript type.
    exit 1;
  fi
  # Do normalization steps.
  cat ${x}.trans | $local/timit_norm_trans.pl -i - -m $local/phones.60-48-39.map -to 39 | sort > $x.text || exit 1;
  # cat ${x}.trans | sort > $x.text || exit 1;
  # Create wav.scp
  awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp
  # Make the utt2spk and spk2utt files.
  cut -f1 -d'_' $x.uttids | paste -d' ' $x.uttids - > $x.utt2spk
  cat $x.utt2spk | $local/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
  # Prepare gender mapping (first letter of the speaker id: f/m).
  cat $x.spk2utt | awk '{print $1}' | perl -ane 'chop; m:^.:; $g = lc($&); print "$_ $g\n";' > $x.spk2gender
  if "${create_glm_stm}"; then
    # Prepare STM file for sclite:
    wav-to-duration --read-entire-file=true scp:${x}_wav.scp ark,t:${x}_dur.ark || exit 1
    awk -v dur=${x}_dur.ark \
'BEGIN{
while(getline < dur) { durH[$1]=$2; }
print ";; LABEL \"O\" \"Overall\" \"Overall\"";
print ";; LABEL \"F\" \"Female\" \"Female speakers\"";
print ";; LABEL \"M\" \"Male\" \"Male speakers\"";
}
{ wav=$1; spk=wav; sub(/_.*/,"",spk); $1=""; ref=$0;
gender=(substr(spk,0,1) == "f" ? "F" : "M");
printf("%s 1 %s 0.0 %f <O,%s> %s\n", wav, spk, durH[wav], gender, ref);
}
' ${x}.text >${x}.stm || exit 1
    # Create dummy GLM file for sclite:
    echo ';; empty.glm
[FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token
' > ${x}.glm
  fi
done
echo "Data preparation succeeded"
\ No newline at end of file
#!/usr/bin/env bash
# Copyright 2013 (Author: Daniel Povey)
# Apache 2.0
# This script takes data prepared in a corpus-dependent way
# in data/local/, and converts it into the "canonical" form,
# in various subdirectories of data/, e.g. data/lang, data/train, etc.
. ./path.sh || exit 1;
echo "Preparing train, dev and test data"
srcdir=data/local/data
for x in train dev test; do
  mkdir -p data/$x
  # cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
  # Rewrite wav.scp to point at real wav files; tmp_wav.scp receives one
  # sph2pipe conversion command per line.
  local/sph2pipe_process.py $srcdir/${x}_wav.scp data/${x}/tmp_wav.scp data/${x}/wav.scp || exit 1;
  # Execute each conversion command.  BUGFIX: read -r keeps backslashes
  # intact and echo is quoted; $line itself is intentionally unquoted so
  # the command word-splits into program + arguments.
  while read -r line
  do
    echo "$line"
    $line
  done < data/${x}/tmp_wav.scp
  rm data/${x}/tmp_wav.scp
  cp $srcdir/$x.text data/$x/text || exit 1;
  cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
  cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
  tools/filter_scp.pl data/$x/spk2utt $srcdir/$x.spk2gender > data/$x/spk2gender || exit 1;
  [ -e $srcdir/${x}.stm ] && cp $srcdir/${x}.stm data/$x/stm
  [ -e $srcdir/${x}.glm ] && cp $srcdir/${x}.glm data/$x/glm
  # tools/validate_data_dir.sh --no-feats data/$x || exit 1
done
\ No newline at end of file
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2012 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script normalizes the TIMIT phonetic transcripts that have been
# extracted in a format where each line contains an utterance ID followed by
# the transcript, e.g.:
# fcke0_si1111 h# hh ah dx ux w iy dcl d ix f ay n ih q h#
my $usage = "Usage: timit_norm_trans.pl -i transcript -m phone_map -from [60|48] -to [48|39] > normalized\n
Normalizes phonetic transcriptions for TIMIT, by mapping the phones to a
smaller set defined by the -m option. This script assumes that the mapping is
done in the \"standard\" fashion, i.e. to 48 or 39 phones. The input is
assumed to have 60 phones (+1 for glottal stop, which is deleted), but that can
be changed using the -from option. The input format is assumed to be utterance
ID followed by transcript on the same line.\n";
use strict;
use Getopt::Long;
die "$usage" unless(@ARGV >= 1);
my ($in_trans, $phone_map, $num_phones_out);
my $num_phones_in = 60;  # input inventory size: 60 (default) or 48
GetOptions ("i=s" => \$in_trans, # Input transcription
            "m=s" => \$phone_map, # File containing phone mappings
            "from=i" => \$num_phones_in, # Input #phones: must be 60 or 48
            "to=i" => \$num_phones_out ); # Output #phones: must be 48 or 39
die $usage unless(defined($in_trans) && defined($phone_map) &&
                  defined($num_phones_out));
# Validate the requested inventory sizes and that the mapping shrinks them.
if ($num_phones_in != 60 && $num_phones_in != 48) {
  die "Can only used 60 or 48 for -from (used $num_phones_in)."
}
if ($num_phones_out != 48 && $num_phones_out != 39) {
  die "Can only used 48 or 39 for -to (used $num_phones_out)."
}
unless ($num_phones_out < $num_phones_in) {
  die "Argument to -from ($num_phones_in) must be greater than that to -to ($num_phones_out)."
}
open(M, "<$phone_map") or die "Cannot open mappings file '$phone_map': $!";
my (%phonemap, %seen_phones);
my $num_seen_phones = 0;  # count of distinct target phones seen in the map
# Read the phone mapping table.  Each line has three columns:
# <60-phone> <48-phone> <39-phone>; the source/target columns are picked
# according to -from/-to.  The glottal stop 'q' is dropped entirely.
while (<M>) {
  chomp;
  next if ($_ =~ /^q\s*.*$/); # Ignore glottal stops.
  m:^(\S+)\s+(\S+)\s+(\S+)$: or die "Bad line: $_";
  my $mapped_from = ($num_phones_in == 60)? $1 : $2;   # source column
  my $mapped_to = ($num_phones_out == 48)? $2 : $3;    # target column
  if (!defined($seen_phones{$mapped_to})) {
    $seen_phones{$mapped_to} = 1;
    $num_seen_phones += 1;
  }
  $phonemap{$mapped_from} = $mapped_to;
}
# Sanity check: the target inventory size must match the -to argument.
if ($num_seen_phones != $num_phones_out) {
  die "Trying to map to $num_phones_out phones, but seen only $num_seen_phones";
}
open(T, "<$in_trans") or die "Cannot open transcription file '$in_trans': $!";
# For each "<utt-id> <phone ...>" line: drop glottal stops, trim, and print
# the utterance id followed by each phone mapped through %phonemap
# (unmapped phones pass through unchanged).
while (<T>) {
  chomp;
  m:^(\S+)\s+(.+): or die "Bad line: $_";
  my ($utt_id, $trans) = ($1, $2);
  $trans =~ s/q//g;        # Remove glottal stops.
  $trans =~ s/^\s+//;      # Normalize spaces
  $trans =~ s/\s+$//;
  print $utt_id;
  foreach my $ph (split(/\s+/, $trans)) {
    if (exists $phonemap{$ph}) {
      print " $phonemap{$ph}";
    } else {
      print " $ph";
    }
  }
  print "\n";
}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# converts an utt2spk file to a spk2utt file.
# Takes input from the stdin or from a file argument;
# output goes to the standard out.
if ( @ARGV > 1 ) {
die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
}
# Read "utt spk" pairs (stdin or file argument) and print one line per
# speaker — "spk utt1 utt2 ..." — speakers in order of first appearance.
while (<>) {
  my @pair = split(" ", $_);
  @pair == 2 || die "Invalid line in utt2spk file: $_";
  my ($utt, $spk) = @pair;
  unless ($first_seen{$spk}) {
    $first_seen{$spk} = 1;
    push @speaker_order, $spk;
  }
  push @{$utts_of{$spk}}, $utt;
}
foreach my $spk (@speaker_order) {
  my $utt_list = join(' ', @{$utts_of{$spk}});
  print "$spk $utt_list\n";
}
#!/usr/bin/env bash
cmd="$@"
no_feats=false
no_wav=false
no_text=false
no_spk_sort=false
for x in `seq 4`; do
if [ "$1" == "--no-feats" ]; then
no_feats=true
shift;
fi
if [ "$1" == "--no-text" ]; then
no_text=true
shift;
fi
if [ "$1" == "--no-wav" ]; then
no_wav=true
shift;
fi
if [ "$1" == "--no-spk-sort" ]; then
no_spk_sort=true
shift;
fi
done
if [ $# -ne 1 ]; then
echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] <data-dir>"
echo "The --no-xxx options mean that the script does not require "
echo "xxx.scp to be present, but it will check it if it is present."
echo "--no-spk-sort means that the script does not require the utt2spk to be "
echo "sorted by the speaker-id in addition to being sorted by utterance-id."
echo "By default, utt2spk is expected to be sorted by both, which can be "
echo "achieved by making the speaker-id prefixes of the utterance-ids"
echo "e.g.: $0 data/train"
exit 1;
fi
data=$1
if [ ! -d $data ]; then
echo "$0: no such directory $data"
exit 1;
fi
if [ -f $data/images.scp ]; then
cmd=${cmd/--no-wav/} # remove --no-wav if supplied
image/validate_data_dir.sh $cmd
exit $?
fi
for f in spk2utt utt2spk; do
if [ ! -f $data/$f ]; then
echo "$0: no such file $f"
exit 1;
fi
if [ ! -s $data/$f ]; then
echo "$0: empty file $f"
exit 1;
fi
done
! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \
echo "$0: $data/utt2spk has wrong format." && exit;
ns=$(wc -l < $data/spk2utt)
if [ "$ns" == 1 ]; then
echo "$0: WARNING: you have only one speaker. This probably a bad idea."
echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html"
echo " for more information."
fi
tmpdir=$(mktemp -d /tmp/kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM
export LC_ALL=C
# Exit the whole script unless file $1 ends every line with a newline and
# its first column is C-locale sorted with no duplicate keys.
function check_sorted_and_uniq {
  ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1;
  ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \
    echo "$0: file $1 is not in sorted order or has duplicates" && exit 1;
}
# partial_diff <file1> <file2>
# Prints an abridged unified diff of the two files (first ~6 and last ~6
# lines of the diff) followed by a line-count summary; used as context for
# the caller's error messages.
# Idiom fixes: $(...) instead of backticks, 'wc -l < file' instead of
# 'cat | wc -l', quoted arguments, and 'local' counters so n1/n2 no longer
# leak into the global namespace.
function partial_diff {
  diff -U1 "$1" "$2" | (head -n 6; echo "..."; tail -n 6)
  local n1 n2
  n1=$(wc -l < "$1")
  n2=$(wc -l < "$2")
  echo "[Lengths are $1=$n1 versus $2=$n2]"
}
# utt2spk: sorted and unique on utterance-id.
check_sorted_and_uniq $data/utt2spk
# Unless --no-spk-sort, also insist that sorting utt2spk by the speaker
# column leaves it unchanged, i.e. speaker-ids group contiguously (normally
# achieved by making the speaker-id a prefix of the utterance-id).
if ! $no_spk_sort; then
! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \
echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \
echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1;
fi
check_sorted_and_uniq $data/spk2utt
# spk2utt must be the exact inverse mapping of utt2spk.
! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \
<(utils/spk2utt_to_utt2spk.pl $data/spk2utt) && \
echo "$0: spk2utt and utt2spk do not seem to match" && exit 1;
# Canonical utterance-id list; all later sections compare against this file.
cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts
if [ ! -f $data/text ] && ! $no_text; then
echo "$0: no such file $data/text (if this is by design, specify --no-text)"
exit 1;
fi
num_utts=`cat $tmpdir/utts | wc -l`
if [ -f $data/text ]; then
# validate_text.pl checks encoding/whitespace issues in the transcripts.
utils/validate_text.pl $data/text || exit 1;
check_sorted_and_uniq $data/text
text_len=`cat $data/text | wc -l`
# Symbols reserved by the lang directory must never occur in transcripts.
illegal_sym_list="<s> </s> #0"
for x in $illegal_sym_list; do
if grep -w "$x" $data/text > /dev/null; then
echo "$0: Error: in $data, text contains illegal symbol $x"
exit 1;
fi
done
# text must cover exactly the same utterance-ids as utt2spk.
awk '{print $1}' < $data/text > $tmpdir/utts.txt
if ! cmp -s $tmpdir/utts{,.txt}; then
echo "$0: Error: in $data, utterance lists extracted from utt2spk and text"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.txt}
exit 1;
fi
fi
# A segments file without wav.scp is inconsistent: segments refers to
# recording-ids that only wav.scp can define.
if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then
echo "$0: in directory $data, segments file exists but no wav.scp"
exit 1;
fi
if [ ! -f $data/wav.scp ] && ! $no_wav; then
echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)"
exit 1;
fi
if [ -f $data/wav.scp ]; then
check_sorted_and_uniq $data/wav.scp
if grep -E -q '^\S+\s+~' $data/wav.scp; then
# note: it's not a good idea to have any kind of tilde in wav.scp, even if
# part of a command, as it would cause compatibility problems if run by
# other users, but this used to be not checked for so we let it slide unless
# it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which
# would definitely cause problems as the fopen system call does not do
# tilde expansion.
echo "$0: Please do not use tilde (~) in your wav.scp."
exit 1;
fi
if [ -f $data/segments ]; then
check_sorted_and_uniq $data/segments
# We have a segments file -> interpret wav file as "recording-ids" not utterance-ids.
# Each segments line is "<utt-id> <reco-id> <start> <end>" with end > start.
! cat $data/segments | \
awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \
echo "$0: badly formatted segments file" && exit 1;
segments_len=`cat $data/segments | wc -l`
if [ -f $data/text ]; then
! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \
echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \
echo "$0: Lengths are $segments_len vs $num_utts" && \
exit 1
fi
# Recording-id list derived from segments; must equal wav.scp's key list.
cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings
awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav
if ! cmp -s $tmpdir/recordings{,.wav}; then
echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/recordings{,.wav}
exit 1;
fi
if [ -f $data/reco2file_and_channel ]; then
# this file is needed only for ctm scoring; it's indexed by recording-id.
# Lines must be "<reco-id> <filename> <channel>", channel A or B; a channel
# of "1" is tolerated with a warning only.
check_sorted_and_uniq $data/reco2file_and_channel
! cat $data/reco2file_and_channel | \
awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
if ( NF == 3 && $3 == "1" ) {
warning_issued = 1;
} else {
print "Bad line ", $0; exit 1;
}
}
}
END {
if (warning_issued == 1) {
print "The channel should be marked as A or B, not 1! You should change it ASAP! "
}
}' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc
if ! cmp -s $tmpdir/recordings{,.r2fc}; then
echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/recordings{,.r2fc}
exit 1;
fi
fi
else
# No segments file -> assume wav.scp indexed by utterance.
cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav
if ! cmp -s $tmpdir/utts{,.wav}; then
echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.wav}
exit 1;
fi
if [ -f $data/reco2file_and_channel ]; then
# this file is needed only for ctm scoring; it's indexed by recording-id.
# (Same awk check as the segments branch; here the keys are utterance-ids.)
check_sorted_and_uniq $data/reco2file_and_channel
! cat $data/reco2file_and_channel | \
awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
if ( NF == 3 && $3 == "1" ) {
warning_issued = 1;
} else {
print "Bad line ", $0; exit 1;
}
}
}
END {
if (warning_issued == 1) {
print "The channel should be marked as A or B, not 1! You should change it ASAP! "
}
}' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc
if ! cmp -s $tmpdir/utts{,.r2fc}; then
echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.r2fc}
exit 1;
fi
fi
fi
fi
# feats.scp is required unless --no-feats was given; when present it must be
# keyed by exactly the utterance-ids of utt2spk.
if ! $no_feats && [ ! -f $data/feats.scp ]; then
  echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)"
  exit 1;
fi
if [ -f $data/feats.scp ]; then
  check_sorted_and_uniq $data/feats.scp
  awk '{print $1}' < $data/feats.scp > $tmpdir/utts.feats
  cmp -s $tmpdir/utts $tmpdir/utts.feats || {
    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts $tmpdir/utts.feats
    exit 1;
  }
fi
# cmvn.scp (optional) is indexed by speaker-id; its key list must equal the
# speaker list of spk2utt.
if [ -f $data/cmvn.scp ]; then
  check_sorted_and_uniq $data/cmvn.scp
  awk '{print $1}' < $data/cmvn.scp > $tmpdir/speakers.cmvn
  awk '{print $1}' < $data/spk2utt > $tmpdir/speakers
  cmp -s $tmpdir/speakers $tmpdir/speakers.cmvn || {
    echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/speakers $tmpdir/speakers.cmvn
    exit 1;
  }
fi
# spk2gender (optional): "<spk-id> m|f", with speakers matching spk2utt
# exactly.
if [ -f $data/spk2gender ]; then
  check_sorted_and_uniq $data/spk2gender
  if ! awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' < $data/spk2gender; then
    echo "$0: Mal-formed spk2gender file"
    exit 1;
  fi
  awk '{print $1}' < $data/spk2gender > $tmpdir/speakers.spk2gender
  awk '{print $1}' < $data/spk2utt > $tmpdir/speakers
  cmp -s $tmpdir/speakers $tmpdir/speakers.spk2gender || {
    echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/speakers $tmpdir/speakers.spk2gender
    exit 1;
  }
fi
# spk2warp (optional): per-speaker VTLN warp factor, sanity-checked to the
# open interval (0.5, 1.5); speakers must match spk2utt exactly.
if [ -f $data/spk2warp ]; then
  check_sorted_and_uniq $data/spk2warp
  if ! awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' < $data/spk2warp; then
    echo "$0: Mal-formed spk2warp file"
    exit 1;
  fi
  awk '{print $1}' < $data/spk2warp > $tmpdir/speakers.spk2warp
  awk '{print $1}' < $data/spk2utt > $tmpdir/speakers
  cmp -s $tmpdir/speakers $tmpdir/speakers.spk2warp || {
    echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/speakers $tmpdir/speakers.spk2warp
    exit 1;
  }
fi
# utt2warp (optional): per-utterance VTLN warp factor in (0.5, 1.5);
# utterance-ids must match utt2spk exactly.
if [ -f $data/utt2warp ]; then
  check_sorted_and_uniq $data/utt2warp
  if ! awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' < $data/utt2warp; then
    echo "$0: Mal-formed utt2warp file"
    exit 1;
  fi
  awk '{print $1}' < $data/utt2warp > $tmpdir/utts.utt2warp
  # Re-derive the reference utterance list (as the original code did).
  awk '{print $1}' < $data/utt2spk > $tmpdir/utts
  cmp -s $tmpdir/utts $tmpdir/utts.utt2warp || {
    echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts $tmpdir/utts.utt2warp
    exit 1;
  }
fi
# check some optionally-required things: vad.scp, utt2lang and utt2uniq
# must, when present, share utt2spk's utterance-id list exactly.
for f in vad.scp utt2lang utt2uniq; do
  [ -f $data/$f ] || continue
  check_sorted_and_uniq $data/$f
  if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \
              <( awk '{print $1}' $data/$f ); then
    echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list"
    exit 1;
  fi
done
# utt2dur (optional): "<utt-id> <duration>", durations strictly positive,
# utterance-ids matching utt2spk.
if [ -f $data/utt2dur ]; then
  check_sorted_and_uniq $data/utt2dur
  awk '{print $1}' < $data/utt2dur > $tmpdir/utts.utt2dur
  cmp -s $tmpdir/utts $tmpdir/utts.utt2dur || {
    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts $tmpdir/utts.utt2dur
    exit 1;
  }
  awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' \
    < $data/utt2dur || exit 1
fi
# utt2num_frames (optional): "<utt-id> <frame-count>", counts must be
# positive integers; utterance-ids matching utt2spk.
if [ -f $data/utt2num_frames ]; then
  check_sorted_and_uniq $data/utt2num_frames
  awk '{print $1}' < $data/utt2num_frames > $tmpdir/utts.utt2num_frames
  cmp -s $tmpdir/utts $tmpdir/utts.utt2num_frames || {
    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts $tmpdir/utts.utt2num_frames
    exit 1
  }
  awk '{
    if (NF != 2 || !($2 > 0) || $2 != int($2)) {
      print "Bad line utt2num_frames:" NR ":" $0
      exit 1 } }' < $data/utt2num_frames || exit 1
fi
# reco2dur (optional): recording durations. If a segments file produced
# $tmpdir/recordings earlier, reco2dur's keys must match those
# recording-ids; otherwise wav.scp is utterance-indexed, so the keys must
# match the utterance list.
if [ -f $data/reco2dur ]; then
  check_sorted_and_uniq $data/reco2dur
  awk '{print $1}' < $data/reco2dur > $tmpdir/recordings.reco2dur
  if [ -f $tmpdir/recordings ]; then
    cmp -s $tmpdir/recordings $tmpdir/recordings.reco2dur || {
      echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file"
      echo "$0: differ, partial diff is:"
      partial_diff $tmpdir/recordings $tmpdir/recordings.reco2dur
      exit 1;
    }
  else
    cmp -s $tmpdir/utts $tmpdir/recordings.reco2dur || {
      echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file"
      echo "$0: differ, partial diff is:"
      partial_diff $tmpdir/utts $tmpdir/recordings.reco2dur
      exit 1;
    }
  fi
  awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' \
    < $data/reco2dur || exit 1
fi
echo "$0: Successfully validated data-directory $data"
# --- runtime environment setup (path.sh) ---
# WENET_DIR: repository root, three directory levels above this example.
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
# Put the runtime binaries, kaldi tools and openfst tools on PATH.
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Make the wenet python package importable from this example directory.
export PYTHONPATH=../../../:$PYTHONPATH
# NOTE(review): a second script (a WeNet TIMIT run.sh) appears to begin here;
# this file looks like a concatenation of several scripts.
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many gpu you use, It's 1-gpu training if you specify
# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch
export CUDA_VISIBLE_DEVICES="0"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO
stage=0 # start from 0 if you need to start from data preparation
stop_stage=4
# The num of nodes or machines used for multi-machine training
# Default 1 for single machine/node
# NFS will be needed if you want run multi-machine training
num_nodes=1
# The rank of each node or machine, range from 0 to num_nodes -1
# The first node/machine sets node_rank 0, the second one sets node_rank 1
# the third one set node_rank 2, and so on. Default 0
node_rank=0
# data
# Site-specific absolute path to the raw TIMIT corpus; override via
# --timit_data (see parse_options.sh below).
timit_data=/home/Liangcd/data/timit
# path to save preproecssed data
# export data=data
nj=16
# data_type can be `raw` or `shard`. Typically, raw is used for small dataset,
# `shard` is used for large dataset which is over 1k hours, and `shard` is
# faster on reading data and training.
data_type=raw
num_utts_per_shard=1000
train_set=train
# Optional train_config
# 1. conf/train_transformer.yaml: Standard transformer
# 2. conf/train_conformer.yaml: Standard conformer
# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer
# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer
# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding
# 6. conf/train_u2++_conformer.yaml: U2++ conformer
# 7. conf/train_u2++_transformer.yaml: U2++ transformer
train_config=conf/train_transformer.yaml
cmvn=true
dir=exp/transformer_phn_5k_acc4_bs16
checkpoint=
# use average_checkpoint will get better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=20
decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring"
# choose in [phn]
trans_type=phn
dict=data/dict/${trans_type}_units.txt
# parse_options.sh lets any variable defined above be overridden on the
# command line as "--name value".
. tools/parse_options.sh || exit 1;
# Stage 0: turn the raw TIMIT distribution into Kaldi-style data directories.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  echo "stage 0: Data preparation"
  echo "preparing data for TIMIT for ${trans_type} level transcripts"
  if ! local/timit_data_prep.sh ${timit_data} ${trans_type}; then
    exit 1;
  fi
  local/timit_format_data.sh
  echo "Finish stage 0"
fi
# Stage 1: estimate global CMVN statistics over the training waveforms;
# consumed later when cmvn=true.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  echo "stage 1: compute global cmvn"
  # Feature extraction settings come from $train_config so the statistics
  # match what training will compute.
  tools/compute_cmvn_stats.py \
    --num_workers 16 \
    --train_config $train_config \
    --in_scp data/${train_set}/wav.scp \
    --out_cmvn data/${train_set}/global_cmvn
  echo "Finish stage 1"
fi
# Stage 2: build the token dictionary from the training transcripts.
# Layout: id 0 = <blank> (CTC), id 1 = <unk>, then one phone per line,
# final id = <sos/eos>.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: make train dict"
# Make train dict
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
# Tokenize transcripts at phone level, flatten to one token per line,
# deduplicate, drop empty lines, and number tokens starting at 2
# (NR+1 because ids 0 and 1 are already taken).
tools/text2token.py -s 1 -n 1 --space sil --trans_type ${trans_type} data/${train_set}/text \
| cut -f 2- -d" " | tr " " "\n" | sort | uniq | grep -v -e '^\s*$' | \
awk '{print $0 " " NR+1}' >> ${dict}
wc -l ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
echo "Finish stage 2"
fi
# Stage 3: convert each split into WeNet's list format ('shard' tar archives
# for large corpora, plain 'raw' lists otherwise).
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  echo "stage 3: Prepare data, prepare required format"
  for x in dev test ${train_set}; do
    case $data_type in
      shard)
        tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
          --num_threads 16 data/$x/wav.scp data/$x/text \
          $(realpath data/$x/shards) data/$x/data.list
        ;;
      *)
        tools/make_raw_list.py data/$x/wav.scp data/$x/text \
          data/$x/data.list
        ;;
    esac
  done
  echo "Finish stage 3"
fi
# Stage 4: DDP training, one backgrounded process per visible GPU.
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  mkdir -p $dir
  # You have to rm `INIT_FILE` manually when you resume or restart a
  # multi-machine training.
  INIT_FILE=$dir/ddp_init
  init_method=file://$(readlink -f $INIT_FILE)
  echo "$0: init method is $init_method"
  # Number of GPUs = number of comma-separated entries in CUDA_VISIBLE_DEVICES.
  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
  # Use "nccl" if it works, otherwise use "gloo"
  dist_backend="gloo"
  # Total process count across all nodes. (Builtin $(( )) arithmetic instead
  # of spawning 'expr' in backticks.)
  world_size=$((num_gpus * num_nodes))
  echo "total gpus is: $world_size"
  cmvn_opts=
  $cmvn && cp data/${train_set}/global_cmvn $dir
  $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
  # train.py rewrites $train_config to $dir/train.yaml with model input and
  # output dimensions; $dir/train.yaml is then used for inference and export.
  for ((i = 0; i < num_gpus; ++i)); do
    {
      # i-th entry of CUDA_VISIBLE_DEVICES; $(( )) replaces the deprecated
      # $[ ] arithmetic syntax.
      gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
      # Global rank of this gpu/process, used to tell the master from the
      # workers.
      rank=$((node_rank * num_gpus + i))
      python wenet/bin/train.py --gpu $gpu_id \
        --config $train_config \
        --data_type $data_type \
        --symbol_table $dict \
        --train_data data/$train_set/data.list \
        --cv_data data/dev/data.list \
        ${checkpoint:+--checkpoint $checkpoint} \
        --model_dir $dir \
        --ddp.init_method $init_method \
        --ddp.world_size $world_size \
        --ddp.rank $rank \
        --ddp.dist_backend $dist_backend \
        --num_workers 1 \
        $cmvn_opts \
        --pin_memory
    } &
  done
  # Barrier: wait for every per-GPU trainer to finish.
  wait
fi
# Stage 5: decode the test set with each decoding mode in parallel and
# score it. NOTE(review): the $test_dir/wer written here is overwritten by
# stage 6's re-scoring on the normalized hypotheses.
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test model, please specify the model you want to test by --checkpoint
# Averaging the best checkpoints (by validation loss, --val_best) usually
# beats decoding from final.pt alone.
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Please specify decoding_chunk_size for unified streaming and
# non-streaming model. The default value is -1, which is full chunk
# for non-streaming inference.
decoding_chunk_size=
ctc_weight=0.5
reverse_weight=0.0
# One background decoding job per mode; all write to disjoint test_${mode}
# directories.
for mode in ${decode_modes}; do
{
test_dir=$dir/test_${mode}
mkdir -p $test_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data data/test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--reverse_weight $reverse_weight \
--result_file $test_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} \
--connect_symbol
# NOTE(review): --connect_symbol is passed without a value here — verify
# recognize.py accepts it bare (it normally takes the joiner string).
python tools/compute-wer.py --char=1 --v=1 \
data/test/text $test_dir/text > $test_dir/wer
} &
done
wait
fi
# Stage 6: re-score WER after mapping the "▁" separator in the hypothesis
# text back to spaces.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
  # compute wer
  for test_set in test; do
    for mode in ${decode_modes}; do
      test_dir=$dir/test_${mode}
      sed 's:▁: :g' $test_dir/text > $test_dir/text.norm
      python tools/compute-wer.py --char=1 --v=1 \
        data/$test_set/text $test_dir/text.norm > $test_dir/wer
    done
  done
fi
# NOTE(review): removed trailing web-UI residue from the code-hosting page
# ("Markdown is supported", "Please register or to comment", a bare
# "../../../tools/" path, etc.). It was not shell code and, if executed,
# would only have produced command-not-found errors at the end of the script.