Commit 764b3a75 authored by Sugon_ldc's avatar Sugon_ldc
Browse files

add new model

parents
#!/usr/bin/env bash
# Copyright 2013 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
# This script operates on a directory, such as in data/train/,
# that contains some subset of the following files:
# feats.scp
# wav.scp
# vad.scp
# spk2utt
# utt2spk
# text
#
# It copies to another directory, possibly adding a specified prefix or a suffix
# to the utterance and/or speaker names. Note, the recording-ids stay the same.
#
# begin configuration section
spk_prefix=
utt_prefix=
spk_suffix=
utt_suffix=
validate_opts= # should rarely be needed.
# end configuration section
. local/parse_options.sh
if [ $# != 2 ]; then
echo "Usage: "
echo " $0 [options] <srcdir> <destdir>"
echo "e.g.:"
echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1"
echo "Options"
echo " --spk-prefix=<prefix> # Prefix for speaker ids, default empty"
echo " --utt-prefix=<prefix> # Prefix for utterance ids, default empty"
echo " --spk-suffix=<suffix> # Suffix for speaker ids, default empty"
echo " --utt-suffix=<suffix> # Suffix for utterance ids, default empty"
exit 1;
fi
# Byte-wise, locale-independent sorting/comparison throughout.
export LC_ALL=C
srcdir=$1
destdir=$2
if [ ! -f $srcdir/utt2spk ]; then
echo "copy_data_dir.sh: no such file $srcdir/utt2spk"
exit 1;
fi
if [ "$destdir" == "$srcdir" ]; then
echo "$0: this script requires <srcdir> and <destdir> to be different."
exit 1
fi
set -e;
mkdir -p $destdir
# Build old-id -> new-id maps, one "<old> <prefix><old><suffix>" pair per line,
# for utterances (utt_map) and speakers (spk_map).
cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map
cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map
# utt2uniq maps the (renamed) utterance-ids back to the original ids; create it
# whenever ids are changed, or re-map an existing one.
if [ ! -f $srcdir/utt2uniq ]; then
if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then
cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq
fi
else
cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq
fi
cat $srcdir/utt2spk | local/apply_map.pl -f 1 $destdir/utt_map | \
local/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk
local/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt
if [ -f $srcdir/feats.scp ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp
fi
if [ -f $srcdir/vad.scp ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp
fi
if [ -f $srcdir/segments ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments
# recording-ids are unchanged, so wav.scp can be copied verbatim.
cp $srcdir/wav.scp $destdir
else # no segments->wav indexed by utt.
if [ -f $srcdir/wav.scp ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp
fi
fi
if [ -f $srcdir/reco2file_and_channel ]; then
cp $srcdir/reco2file_and_channel $destdir/
fi
if [ -f $srcdir/text ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text
fi
if [ -f $srcdir/text.tc ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text.tc >$destdir/text.tc
fi
if [ -f $srcdir/text.lc ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text.lc >$destdir/text.lc
fi
if [ -f $srcdir/text.lc.rm ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text.lc.rm >$destdir/text.lc.rm
fi
if [ -f $srcdir/utt2dur ]; then
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur
fi
if [ -f $srcdir/reco2dur ]; then
if [ -f $srcdir/segments ]; then
# with segments, reco2dur is indexed by (unchanged) recording-id.
cp $srcdir/reco2dur $destdir/reco2dur
else
local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur
fi
fi
if [ -f $srcdir/spk2gender ]; then
local/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender
fi
if [ -f $srcdir/cmvn.scp ]; then
local/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp
fi
for f in stm glm ctm; do
if [ -f $srcdir/$f ]; then
cp $srcdir/$f $destdir
fi
done
rm $destdir/spk_map $destdir/utt_map
echo "$0: copied data from $srcdir to $destdir"
# Move any stale files present in destdir but absent from srcdir out of the
# way, so the validation below only sees the freshly copied data.
for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel stm glm ctm; do
if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then
echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to"
echo " ... $destdir/.backup/$f"
mkdir -p $destdir/.backup
mv $destdir/$f $destdir/.backup/
fi
done
[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats"
[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text"
local/validate_data_dir.sh $validate_opts $destdir
#!/bin/bash
# Download one part of the AISHELL-4 corpus from an OpenSLR mirror and
# extract it under <data-base>. A stamp file <data-base>/<part>/.complete
# marks a finished extraction, so re-running is a no-op.
if [ $# -ne 3 ]; then
  echo "Usage: $0 <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /home/data/aishell4 https://www.openslr.org/resources/111 train_L"
  echo "<corpus-part> can be one of: train_L, train_M, train_S, test."
  exit 1;  # bugfix: was missing, so the script ran on with bad arguments
fi
data=$1
url=$2
part=$3
if [ ! -d "$data" ]; then
  echo "$0: no such directory $data"
  exit 1;
fi
part_ok=false
list="train_L train_M train_S test"
for x in $list; do
  if [ "$part" == $x ]; then part_ok=true; fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1;
fi
if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1;
fi
if [ -f $data/$part/.complete ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0;
fi
# Any pre-existing archive may be a partial download; remove it and re-fetch.
if [ -f $data/$part.tar.gz ]; then
  echo "$0: removing existing file $data/$part.tar.gz"
  rm $data/$part.tar.gz
fi
if ! command -v wget >/dev/null; then
  echo "$0: wget is not installed."
  exit 1;
fi
full_url=$url/$part.tar.gz
echo "$0: downloading data from $full_url. This may take some time, please be patient."
# Download into $data with -P instead of cd-ing there; the old version
# cd'ed twice, which broke all later paths when <data-base> was relative.
if ! wget --no-check-certificate -P $data $full_url; then
  echo "$0: error executing wget $full_url"
  exit 1;
fi
# Extract inside $data via -C, again without changing directory.
# (Messages now consistently say .tar.gz; they used to say .tgz.)
if ! tar -xvzf $data/$part.tar.gz -C $data; then
  echo "$0: error un-tarring archive $data/$part.tar.gz"
  exit 1;
fi
touch $data/$part/.complete
echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation
# Johns Hopkins University (author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch
# Option parsing: loop until a pass consumes nothing, so --exclude and
# -f <n> may appear in either order.
$exclude = 0;
$field = 1;
$shifted = 0;
do {
$shifted=0;
if ($ARGV[0] eq "--exclude") {
$exclude = 1;
shift @ARGV;
$shifted=1;
}
if ($ARGV[0] eq "-f") {
$field = $ARGV[1];
shift @ARGV; shift @ARGV;
$shifted=1
}
} while ($shifted);
if(@ARGV < 1 || @ARGV > 2) {
die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
"Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
"Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
"only the lines that were *not* in id_list.\n" .
"Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
"If your older scripts (written before Oct 2014) stopped working and you used the\n" .
"-f option, add 1 to the argument.\n" .
"See also: utils/filter_scp.pl .\n";
}
# Read the id-list into %seen; only the first field of each line matters.
$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
@A = split;
@A>=1 || die "Invalid id-list file line $_";
$seen{$A[0]} = 1;
}
if ($field == 1) { # Treat this as special case, since it is common.
while(<>) {
$_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
# $1 is what we filter on.
if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
print $_;
}
}
} else {
# General case: filter on the $field'th whitespace-separated field.
while(<>) {
@A = split;
@A > 0 || die "Invalid scp file line $_";
@A >= $field || die "Invalid scp file line $_";
if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
print $_;
}
}
}
# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
#!/bin/bash
# Prepare Kaldi-style data directories for AISHELL-4: builds wav.scp, text,
# utt2spk, spk2utt and segments for the combined training set
# (train_L + train_M + train_S) and for the test set, then copies them to
# data/aishell4_{train,test} with an "Aishell4-" id prefix.
. ./path.sh || exit 1;

if [ $# != 1 ]; then
  echo "Usage: $0 <audio-path>"
  echo " $0 /home/data/aishell4"
  exit 1;
fi

aishell4_source_dir=$1
train_dir=data/local/aishell4_train
test_dir=data/local/aishell4_test

mkdir -p $train_dir
mkdir -p $test_dir

# data directory check
# bugfix: the old check tested the undefined variables $aishell_audio_dir and
# $aishell_text, so it could never fire; test the actual argument instead.
if [ ! -d $aishell4_source_dir ]; then
  echo "Error: $0 requires a directory argument"
  exit 1;
fi

# For every corpus part, (re)build flat lists of wav files and of TextGrid
# annotation files.
for room_name in "train_L" "train_M" "train_S" "test"; do
  if [ -f ${aishell4_source_dir}/$room_name/wav_list.txt ];then
    rm ${aishell4_source_dir}/$room_name/wav_list.txt
  fi
  FILES="$PWD/${aishell4_source_dir}/$room_name/wav/*"
  for f in $FILES; do
    echo "$f" >> ${aishell4_source_dir}/$room_name/wav_list.txt
  done
  if [ -f ${aishell4_source_dir}/$room_name/TextGrid_list.txt ];then
    rm ${aishell4_source_dir}/$room_name/TextGrid_list.txt
  fi
  FILES="$PWD/${aishell4_source_dir}/$room_name/TextGrid/*.TextGrid"
  for f in $FILES; do
    echo "$f" >> ${aishell4_source_dir}/$room_name/TextGrid_list.txt
  done
done

# Merge the three training parts into a single pair of lists.
# (robustness fix: truncate first, so re-running does not duplicate entries)
mkdir -p ${aishell4_source_dir}/full_train
rm -f ${aishell4_source_dir}/full_train/textgrid.flist ${aishell4_source_dir}/full_train/wav.flist
for r in train_L train_M train_S ; do
  cat ${aishell4_source_dir}/$r/TextGrid_list.txt >> ${aishell4_source_dir}/full_train/textgrid.flist
  cat ${aishell4_source_dir}/$r/wav_list.txt >> ${aishell4_source_dir}/full_train/wav.flist
done
wav_list_aishell4=${aishell4_source_dir}/full_train/wav.flist
text_grid_aishell4=${aishell4_source_dir}/full_train/textgrid.flist

# bugfix: the processing below reads wav.flist/textgrid.flist from $train_dir
# and $test_dir, but those files were never placed there.
cp $wav_list_aishell4 $train_dir/wav.flist
cp $text_grid_aishell4 $train_dir/textgrid.flist
cp ${aishell4_source_dir}/test/wav_list.txt $test_dir/wav.flist
cp ${aishell4_source_dir}/test/TextGrid_list.txt $test_dir/textgrid.flist

# process train set: utt-ids are the wav basenames without the .wav suffix.
sed -e 's/\.wav//' $train_dir/wav.flist | awk -F '/' '{print $NF}' > $train_dir/utt.list
paste -d' ' $train_dir/utt.list $train_dir/wav.flist | sort -u > $train_dir/wav.scp
python local/aishell4_process_textgrid.py --path $train_dir
cat $train_dir/text_all | local/text_normalize.pl | local/text_format.pl | sort -u > $train_dir/text
# keep only utterances that survived text normalization
local/filter_scp.pl -f 1 $train_dir/text $train_dir/utt2spk_all | sort -u > $train_dir/utt2spk
local/utt2spk_to_spk2utt.pl $train_dir/utt2spk > $train_dir/spk2utt
local/filter_scp.pl -f 1 $train_dir/text $train_dir/segments_all | sort -u > $train_dir/segments

# process test set
sed -e 's/\.wav//' $test_dir/wav.flist | awk -F '/' '{print $NF}' > $test_dir/utt.list
paste -d' ' $test_dir/utt.list $test_dir/wav.flist |sort -u > $test_dir/wav.scp
python local/aishell4_process_textgrid.py --path $test_dir
cat $test_dir/text_all | local/text_normalize.pl | local/text_format.pl | sort -u > $test_dir/text
local/filter_scp.pl -f 1 $test_dir/text $test_dir/utt2spk_all | sort -u > $test_dir/utt2spk
local/utt2spk_to_spk2utt.pl $test_dir/utt2spk > $test_dir/spk2utt
local/filter_scp.pl -f 1 $test_dir/text $test_dir/segments_all | sort -u > $test_dir/segments

# Copy to the final data dirs, prefixing utterance and speaker ids.
local/copy_data_dir.sh --utt-prefix Aishell4- --spk-prefix Aishell4- \
  $train_dir data/aishell4_train
local/copy_data_dir.sh --utt-prefix Aishell4- --spk-prefix Aishell4- \
  $test_dir data/aishell4_test

echo "$0: AISHELL4 data preparation succeeded"
exit 0;
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Converts a spk2utt file ("<spk> <utt1> <utt2> ...") read from stdin or a
# file argument into utt2spk format ("<utt> <spk>", one line per utterance)
# on stdout.
while (my $line = <>) {
  my @fields = split(" ", $line);
  @fields > 1 || die "Invalid line in spk2utt file: $line";
  my $spk = shift @fields;
  print "$_ $spk\n" foreach @fields;
}
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright Chao Weng
# Reads transcript lines from stdin and drops any line that consists of
# exactly one whitespace-separated field (an utterance-id with no text);
# every other line — including empty lines — is passed through unchanged.
while (my $line = <STDIN>) {
  my @fields = split(" ", $line);
  next if @fields == 1;
  print $line;
}
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright Chao Weng
# Transcript normalization: reads "<utt-id> <tok> <tok> ..." lines on stdin;
# for each token after the utt-id, strips annotation tags and punctuation and
# upper-cases tokens containing Latin letters, then prints the cleaned line.
# (Substitution order is preserved from the original script; the per-
# substitution "if" guards were dropped since s///g on a non-match is a no-op,
# and the lower-case a/b/c/k/t branches were dead code because uc() has
# already upper-cased any token that contains a letter.)
while (<STDIN>) {
  @A = split(" ", $_);
  print "$A[0] ";
  for ($n = 1; $n < @A; $n++) {
    $tmp = $A[$n];
    # strip annotation tags
    $tmp =~ s:<sil>::g;
    $tmp =~ s:<%>::g;
    $tmp =~ s:<->::g;
    $tmp =~ s:<\$>::g;
    $tmp =~ s:<#>::g;
    $tmp =~ s:<_>::g;
    $tmp =~ s:<space>::g;
    # strip ASCII punctuation that may be glued onto a token
    $tmp =~ s:`::g;
    $tmp =~ s:&::g;
    $tmp =~ s:,::g;
    # upper-case tokens containing Latin letters (guard kept deliberately:
    # uc() is only applied to tokens that actually contain ASCII letters)
    if ($tmp =~ /[a-zA-Z]/) {$tmp=uc($tmp);}
    # strip CJK punctuation and remaining symbols
    $tmp =~ s:丶::g;
    $tmp =~ s:。::g;
    $tmp =~ s:、::g;
    $tmp =~ s:?::g;
    $tmp =~ s:·::g;
    $tmp =~ s:\*::g;
    $tmp =~ s:!::g;
    $tmp =~ s:\$::g;
    $tmp =~ s:\+::g;
    $tmp =~ s:-::g;
    $tmp =~ s:\\::g;
    $tmp =~ s:\?::g;
    $tmp =~ s:¥::g;
    $tmp =~ s:%::g;
    $tmp =~ s:\.::g;
    $tmp =~ s:<::g;
    print "$tmp ";
  }
  print "\n";
}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Converts an utt2spk file ("<utt> <spk>" per line, from stdin or a file
# argument) into spk2utt format ("<spk> <utt1> <utt2> ..."). Speakers are
# emitted in first-seen order; utterances keep their input order.
if ( @ARGV > 1 ) {
  die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
}
my (%utts_of, @spk_order);
while (my $line = <>) {
  my @fields = split(" ", $line);
  @fields == 2 || die "Invalid line in utt2spk file: $line";
  my ($utt, $spk) = @fields;
  # record first-seen order before autovivifying the hash entry
  push @spk_order, $spk unless exists $utts_of{$spk};
  push @{$utts_of{$spk}}, $utt;
}
for my $spk (@spk_order) {
  print "$spk " . join(' ', @{$utts_of{$spk}}) . "\n";
}
#!/usr/bin/env bash
# Save the untouched argument list; it is re-used verbatim when delegating
# to image/validate_data_dir.sh below.
cmd="$@"
no_feats=false
no_wav=false
no_text=false
no_spk_sort=false
# At most four --no-* flags can be given, in any order; scan four times.
for x in `seq 4`; do
if [ "$1" == "--no-feats" ]; then
no_feats=true
shift;
fi
if [ "$1" == "--no-text" ]; then
no_text=true
shift;
fi
if [ "$1" == "--no-wav" ]; then
no_wav=true
shift;
fi
if [ "$1" == "--no-spk-sort" ]; then
no_spk_sort=true
shift;
fi
done
if [ $# -ne 1 ]; then
echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] <data-dir>"
echo "The --no-xxx options mean that the script does not require "
echo "xxx.scp to be present, but it will check it if it is present."
echo "--no-spk-sort means that the script does not require the utt2spk to be "
echo "sorted by the speaker-id in addition to being sorted by utterance-id."
echo "By default, utt2spk is expected to be sorted by both, which can be "
echo "achieved by making the speaker-id prefixes of the utterance-ids"
echo "e.g.: $0 data/train"
exit 1;
fi
data=$1
if [ ! -d $data ]; then
echo "$0: no such directory $data"
exit 1;
fi
# Image (OCR-style) data directories are validated by a dedicated script.
if [ -f $data/images.scp ]; then
cmd=${cmd/--no-wav/} # remove --no-wav if supplied
image/validate_data_dir.sh $cmd
exit $?
fi
# Both mapping files must exist and be non-empty.
for f in spk2utt utt2spk; do
if [ ! -f $data/$f ]; then
echo "$0: no such file $f"
exit 1;
fi
if [ ! -s $data/$f ]; then
echo "$0: empty file $f"
exit 1;
fi
done
! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \
echo "$0: $data/utt2spk has wrong format." && exit;
ns=$(wc -l < $data/spk2utt)
if [ "$ns" == 1 ]; then
echo "$0: WARNING: you have only one speaker. This probably a bad idea."
echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html"
echo " for more information."
fi
# Temporary work area; removed on any exit via the trap.
tmpdir=$(mktemp -d /tmp/kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM
# Byte-wise, locale-independent sorting/comparison throughout.
export LC_ALL=C
function check_sorted_and_uniq {
! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1;
! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \
echo "$0: file $1 is not in sorted order or has duplicates" && exit 1;
}
# Print an abbreviated unified diff of $1 vs $2 (first six and last six
# lines, separated by "..."), followed by a line-count summary of both files.
function partial_diff {
local file_a=$1 file_b=$2
diff -U1 "$file_a" "$file_b" | { head -n 6; echo "..."; tail -n 6; }
local len_a=$(wc -l < "$file_a")
local len_b=$(wc -l < "$file_b")
echo "[Lengths are $file_a=$len_a versus $file_b=$len_b]"
}
# ---- Cross-checks between utt2spk, spk2utt, text, wav.scp, segments, ----
# ---- feats and the various optional per-utterance/per-speaker files. ----
check_sorted_and_uniq $data/utt2spk
if ! $no_spk_sort; then
! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \
echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \
echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1;
fi
check_sorted_and_uniq $data/spk2utt
# spk2utt must be exactly the inverse mapping of utt2spk.
! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \
<(local/spk2utt_to_utt2spk.pl $data/spk2utt) && \
echo "$0: spk2utt and utt2spk do not seem to match" && exit 1;
# The utterance-id list from utt2spk is the reference all other files are
# compared against.
cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts
if [ ! -f $data/text ] && ! $no_text; then
echo "$0: no such file $data/text (if this is by design, specify --no-text)"
exit 1;
fi
num_utts=`cat $tmpdir/utts | wc -l`
if [ -f $data/text ]; then
local/validate_text.pl $data/text || exit 1;
check_sorted_and_uniq $data/text
text_len=`cat $data/text | wc -l`
# symbols reserved by the lang-directory machinery; must not appear in text.
illegal_sym_list="<s> </s> #0"
for x in $illegal_sym_list; do
if grep -w "$x" $data/text > /dev/null; then
echo "$0: Error: in $data, text contains illegal symbol $x"
exit 1;
fi
done
awk '{print $1}' < $data/text > $tmpdir/utts.txt
if ! cmp -s $tmpdir/utts{,.txt}; then
echo "$0: Error: in $data, utterance lists extracted from utt2spk and text"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.txt}
exit 1;
fi
fi
if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then
echo "$0: in directory $data, segments file exists but no wav.scp"
exit 1;
fi
if [ ! -f $data/wav.scp ] && ! $no_wav; then
echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)"
exit 1;
fi
if [ -f $data/wav.scp ]; then
check_sorted_and_uniq $data/wav.scp
if grep -E -q '^\S+\s+~' $data/wav.scp; then
# note: it's not a good idea to have any kind of tilde in wav.scp, even if
# part of a command, as it would cause compatibility problems if run by
# other users, but this used to be not checked for so we let it slide unless
# it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which
# would definitely cause problems as the fopen system call does not do
# tilde expansion.
echo "$0: Please do not use tilde (~) in your wav.scp."
exit 1;
fi
if [ -f $data/segments ]; then
check_sorted_and_uniq $data/segments
# We have a segments file -> interpret wav file as "recording-ids" not utterance-ids.
# Each segments line must be "<utt> <reco> <start> <end>" with end > start.
! cat $data/segments | \
awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \
echo "$0: badly formatted segments file" && exit 1;
segments_len=`cat $data/segments | wc -l`
if [ -f $data/text ]; then
! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \
echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \
echo "$0: Lengths are $segments_len vs $num_utts" && \
exit 1
fi
# recording-ids referenced by segments must match wav.scp exactly.
cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings
awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav
if ! cmp -s $tmpdir/recordings{,.wav}; then
echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/recordings{,.wav}
exit 1;
fi
if [ -f $data/reco2file_and_channel ]; then
# this file is needed only for ctm scoring; it's indexed by recording-id.
check_sorted_and_uniq $data/reco2file_and_channel
! cat $data/reco2file_and_channel | \
awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
if ( NF == 3 && $3 == "1" ) {
warning_issued = 1;
} else {
print "Bad line ", $0; exit 1;
}
}
}
END {
if (warning_issued == 1) {
print "The channel should be marked as A or B, not 1! You should change it ASAP! "
}
}' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc
if ! cmp -s $tmpdir/recordings{,.r2fc}; then
echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/recordings{,.r2fc}
exit 1;
fi
fi
else
# No segments file -> assume wav.scp indexed by utterance.
cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav
if ! cmp -s $tmpdir/utts{,.wav}; then
echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.wav}
exit 1;
fi
if [ -f $data/reco2file_and_channel ]; then
# this file is needed only for ctm scoring; it's indexed by recording-id.
check_sorted_and_uniq $data/reco2file_and_channel
! cat $data/reco2file_and_channel | \
awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
if ( NF == 3 && $3 == "1" ) {
warning_issued = 1;
} else {
print "Bad line ", $0; exit 1;
}
}
}
END {
if (warning_issued == 1) {
print "The channel should be marked as A or B, not 1! You should change it ASAP! "
}
}' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc
if ! cmp -s $tmpdir/utts{,.r2fc}; then
echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.r2fc}
exit 1;
fi
fi
fi
fi
# feats.scp: per-utterance feature archive pointers.
if [ ! -f $data/feats.scp ] && ! $no_feats; then
echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)"
exit 1;
fi
if [ -f $data/feats.scp ]; then
check_sorted_and_uniq $data/feats.scp
cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats
if ! cmp -s $tmpdir/utts{,.feats}; then
echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.feats}
exit 1;
fi
fi
# cmvn.scp is indexed by speaker-id, not utterance-id.
if [ -f $data/cmvn.scp ]; then
check_sorted_and_uniq $data/cmvn.scp
cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn
cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
if ! cmp -s $tmpdir/speakers{,.cmvn}; then
echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/speakers{,.cmvn}
exit 1;
fi
fi
if [ -f $data/spk2gender ]; then
check_sorted_and_uniq $data/spk2gender
! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \
echo "$0: Mal-formed spk2gender file" && exit 1;
cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender
cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
if ! cmp -s $tmpdir/speakers{,.spk2gender}; then
echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/speakers{,.spk2gender}
exit 1;
fi
fi
# VTLN warp factors, per speaker; sanity range (0.5, 1.5).
if [ -f $data/spk2warp ]; then
check_sorted_and_uniq $data/spk2warp
! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \
echo "$0: Mal-formed spk2warp file" && exit 1;
cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp
cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
if ! cmp -s $tmpdir/speakers{,.spk2warp}; then
echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/speakers{,.spk2warp}
exit 1;
fi
fi
# VTLN warp factors, per utterance.
if [ -f $data/utt2warp ]; then
check_sorted_and_uniq $data/utt2warp
! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \
echo "$0: Mal-formed utt2warp file" && exit 1;
cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp
cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts
if ! cmp -s $tmpdir/utts{,.utt2warp}; then
echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.utt2warp}
exit 1;
fi
fi
# check some optionally-required things
for f in vad.scp utt2lang utt2uniq; do
if [ -f $data/$f ]; then
check_sorted_and_uniq $data/$f
if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \
<( awk '{print $1}' $data/$f ); then
echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list"
exit 1;
fi
fi
done
# utt2dur: per-utterance duration in seconds; must be positive.
if [ -f $data/utt2dur ]; then
check_sorted_and_uniq $data/utt2dur
cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur
if ! cmp -s $tmpdir/utts{,.utt2dur}; then
echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.utt2dur}
exit 1;
fi
cat $data/utt2dur | \
awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1
fi
# utt2num_frames: per-utterance frame count; must be a positive integer.
if [ -f $data/utt2num_frames ]; then
check_sorted_and_uniq $data/utt2num_frames
cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames
if ! cmp -s $tmpdir/utts{,.utt2num_frames}; then
echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/utts{,.utt2num_frames}
exit 1
fi
awk <$data/utt2num_frames '{
if (NF != 2 || !($2 > 0) || $2 != int($2)) {
print "Bad line utt2num_frames:" NR ":" $0
exit 1 } }' || exit 1
fi
# reco2dur: indexed by recording-id when segments exist, else by utterance-id.
if [ -f $data/reco2dur ]; then
check_sorted_and_uniq $data/reco2dur
cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur
if [ -f $tmpdir/recordings ]; then
if ! cmp -s $tmpdir/recordings{,.reco2dur}; then
echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/recordings{,.reco2dur}
exit 1;
fi
else
if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then
echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file"
echo "$0: differ, partial diff is:"
partial_diff $tmpdir/{utts,recordings.reco2dur}
exit 1;
fi
fi
cat $data/reco2dur | \
awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1
fi
echo "$0: Successfully validated data-directory $data"
#!/usr/bin/env perl
#
#===============================================================================
# Copyright 2017 Johns Hopkins University (author: Yenda Trmal <jtrmal@gmail.com>)
# Johns Hopkins University (author: Daniel Povey)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
# validation script for data/<dataset>/text
# to be called (preferably) from utils/validate_data_dir.sh
use strict;
use warnings;
use utf8;
use Fcntl qw< SEEK_SET >;
# this function reads the opened file (supplied as a first
# parameter) into an array of lines. For each
# line, it tests whether it's a valid utf-8 compatible
# line. If all lines are valid utf-8, it returns the lines
# decoded as utf-8, otherwise it assumes the file's encoding
# is one of those 1-byte encodings, such as ISO-8859-x
# or Windows CP-X.
# Please recall we do not really care about
# the actually encoding, we just need to
# make sure the length of the (decoded) string
# is correct (to make the output formatting looking right).
sub get_utf8_or_bytestream {
use Encode qw(decode encode);
my $is_utf_compatible = 1;
my @unicode_lines;
my @raw_lines;
my $raw_text;
my $lineno = 0;
my $file = shift;
while (<$file>) {
$raw_text = $_;
last unless $raw_text;
if ($is_utf_compatible) {
# FB_CROAK makes decode() die on invalid UTF-8; the eval turns that
# into an undef, which flips $is_utf_compatible permanently.
my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ;
$is_utf_compatible = $is_utf_compatible && defined($decoded_text);
push @unicode_lines, $decoded_text;
} else {
#print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n";
;
}
push @raw_lines, $raw_text;
$lineno += 1;
}
# Returns (1, decoded lines) for UTF-8 input, (0, raw byte lines) otherwise.
if (!$is_utf_compatible) {
return (0, @raw_lines);
} else {
return (1, @unicode_lines);
}
}
# check if the given unicode string contain unicode whitespaces
# other than the usual four: TAB, LF, CR and SPACE
sub validate_utf8_whitespaces {
my $unicode_lines = shift;
use feature 'unicode_strings';
for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) {
my $current_line = $unicode_lines->[$i];
if ((substr $current_line, -1) ne "\n"){
print STDERR "$0: The current line (nr. $i) has invalid newline\n";
return 1;
}
my @A = split(" ", $current_line);
my $utt_id = $A[0];
# we replace TAB, LF, CR, and SPACE
# this is to simplify the test
if ($current_line =~ /\x{000d}/) {
print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n";
return 1;
}
# after masking TAB/LF/SPACE with '.', any remaining \s match is a
# disallowed Unicode whitespace.
$current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g;
if ($current_line =~/\s/) {
print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n";
return 1;
}
}
return 0;
}
# checks if the text in the file (supplied as the argument) is utf-8 compatible
# if yes, checks if it contains only allowed whitespaces. If no, then does not
# do anything. The function seeks to the original position in the file after
# reading the text.
sub check_allowed_whitespace {
my $file = shift;
my $filename = shift;
my $pos = tell($file);
(my $is_utf, my @lines) = get_utf8_or_bytestream($file);
seek($file, $pos, SEEK_SET);
if ($is_utf) {
my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines);
if ($has_invalid_whitespaces) {
print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n";
return 0;
}
}
return 1;
}
# Main: validate the single <text-file> argument.
if(@ARGV != 1) {
die "Usage: validate_text.pl <text-file>\n" .
"e.g.: validate_text.pl data/train/text\n";
}
my $text = shift @ARGV;
# -z is true for an empty file and also for a non-existent one.
if (-z "$text") {
print STDERR "$0: ERROR: file '$text' is empty or does not exist\n";
exit 1;
}
if(!open(FILE, "<$text")) {
print STDERR "$0: ERROR: failed to open $text\n";
exit 1;
}
check_allowed_whitespace(\*FILE, $text) or exit 1;
close(FILE);
# Environment setup: puts the WeNet runtime binaries, Kaldi tools and OpenFst
# binaries on PATH, and the WeNet repo root on PYTHONPATH.
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.

# AISHELL-4 conformer recipe: data download/preparation, dictionary
# building, shard packing, DDP training, decoding and TorchScript export.
. ./path.sh || exit 1;

# Use this to control how many gpus you use. It's 1-gpu training if you
# specify just one gpu; otherwise it is multi-gpu training based on DDP in
# pytorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO

stage=0 # start from 0 if you need to start from data preparation
stop_stage=6 # last stage to run (inclusive)

# The num of nodes or machines used for multi-machine training.
# Default 1 for single machine/node.
# NFS will be needed if you want to run multi-machine training.
num_nodes=1
# The rank of each node or machine, range from 0 to num_nodes - 1.
# The first node/machine sets node_rank 0, the second one sets node_rank 1,
# the third one sets node_rank 2, and so on. Default 0.
node_rank=0

num_utts_per_shard=1000 # utterances packed per training shard (stage 3)
data_url=https://www.openslr.org/resources/111
data_source=/home/work_nfs5_ssd/yhliang/data/aishell4
# modify this to your AISHELL-4 data path
nj=16
dict=data/dict/lang_char.txt # character dictionary produced in stage 2

train_set=aishell4_train
dev_set=aishell4_test
test_sets=aishell4_test

train_config=conf/train_conformer.yaml
cmvn=true # apply the global CMVN stats computed in stage 1
dir=exp/conformer # experiment/output directory
checkpoint= # set to a .pt path to resume training from it

# use average_checkpoint will get better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=30 # number of checkpoints to average for decoding
decode_modes="attention_rescoring"

. tools/parse_options.sh || exit 1;
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
  echo "stage -1: Data Download"
  # Fetch and unpack each AISHELL-4 archive in turn.
  for part in train_L train_M train_S test; do
    local/download_and_untar.sh ${data_source} ${data_url} ${part}
  done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  # Data preparation: build wav.scp/text/segments etc. from the raw
  # AISHELL-4 corpus directory.
  local/prepare_data.sh ${data_source} || exit 1;
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  # remove the space between the text labels for Mandarin dataset
  for x in ${train_set} ${test_sets}; do
    cp data/${x}/text data/${x}/text.org
    # Keep column 1 (the utt-id); in the transcript (fields 2 onwards):
    # upcase English, join consecutive single letters with "▁", then delete
    # the spaces between Mandarin characters.
    # Fix: use "-f 2-" so the whole transcript is kept.  The previous
    # "-f 2" kept only the second field, silently truncating multi-word
    # transcripts and making the tr/sed/tr pipeline below a no-op.
    paste -d " " <(cut -d " " -f 1 data/${x}/text.org) <(cut -d " " -f 2- data/${x}/text.org \
      | tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") > data/${x}/text
    rm data/${x}/text.org
  done
  # Compute global CMVN statistics over the training waveforms; used for
  # feature normalization during training (when $cmvn is true).
  tools/compute_cmvn_stats.py --num_workers 32 --train_config $train_config \
    --in_scp data/${train_set}/wav.scp \
    --out_cmvn data/$train_set/global_cmvn
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  # Make train dict
  echo "Make a dictionary"
  mkdir -p $(dirname $dict)
  echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
  echo "<unk> 1" >> ${dict} # <unk> must be 1
  # Tokenize the training text to characters, deduplicate, drop blank
  # lines, and number the tokens starting at 2 (ids 0 and 1 are reserved
  # for <blank> and <unk> above).
  tools/text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
    | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
  # Read the file directly instead of the useless `cat | wc -l`.
  num_token=$(wc -l < $dict)
  echo "<sos/eos> $num_token" >> $dict # <eos>
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  # Prepare wenet required data
  echo "Prepare data, prepare required format"
  for x in $train_set ${test_sets}; do
    # Pack audio + text into shard tar files for efficient training IO;
    # data.list records the resulting shard paths.
    tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
      --num_threads 32 --segments data/$x/segments \
      data/$x/wav.scp data/$x/text $(realpath data/$x/shards) data/$x/data.list
  done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  # Training with DDP: one train.py process per visible GPU.
  mkdir -p $dir
  INIT_FILE=$dir/ddp_init
  # You had better rm it manually before you start run.sh on first node.
  # rm -f $INIT_FILE # delete old one before starting
  init_method=file://$(readlink -f $INIT_FILE)
  echo "$0: init method is $init_method"
  # The number of gpus running on each node/machine.
  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
  # Use "nccl" if it works, otherwise use "gloo".
  dist_backend="gloo"
  # The total number of processes/gpus, so that the master knows
  # how many workers to wait for.
  # More details about ddp can be found in
  # https://pytorch.org/tutorials/intermediate/dist_tuto.html
  world_size=$((num_gpus * num_nodes))
  echo "total gpus is: $world_size"
  cmvn_opts=
  $cmvn && cp data/${train_set}/global_cmvn $dir
  $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
  # train.py will write $train_config to $dir/train.yaml with model input
  # and output dimension, train.yaml will be used for inference or model
  # export later
  for ((i = 0; i < $num_gpus; ++i)); do
  {
    gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
    # Rank of each gpu/process used for knowing whether it is
    # the master or a worker.
    rank=$((node_rank * num_gpus + i))
    python wenet/bin/train.py --gpu $gpu_id \
      --config $train_config \
      --data_type shard \
      --symbol_table $dict \
      --train_data data/$train_set/data.list \
      --cv_data data/${dev_set}/data.list \
      ${checkpoint:+--checkpoint $checkpoint} \
      --model_dir $dir \
      --ddp.init_method $init_method \
      --ddp.world_size $world_size \
      --ddp.rank $rank \
      --ddp.dist_backend $dist_backend \
      --num_workers 1 \
      $cmvn_opts
  } & # Fix: run each rank in the background.  All DDP ranks must be alive
      # concurrently or init_process_group blocks forever waiting for
      # world_size peers; the `wait` below joins all of them.
  done
  wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  # Test model, please specify the model you want to test by --checkpoint
  if [ ${average_checkpoint} == true ]; then
    # Average the $average_num saved checkpoints into a single model;
    # this usually decodes better than any individual checkpoint.
    decode_checkpoint=$dir/avg_${average_num}.pt
    echo "do model average and final checkpoint is $decode_checkpoint"
    python wenet/bin/average_model.py \
      --dst_model $decode_checkpoint \
      --src_path $dir \
      --num ${average_num}
  fi
  # Specify decoding_chunk_size if it's a unified dynamic chunk trained model
  # -1 for full chunk
  decoding_chunk_size=
  ctc_weight=0.5
  # Decode every (mode, test set) pair; each per-set job runs in the
  # background and the final `wait` joins them all.
  for mode in ${decode_modes}; do
  {
    for test_set in ${test_sets}; do
    {
      test_dir=$dir/test_${mode}
      mkdir -p $test_dir
      python wenet/bin/recognize.py --gpu $(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f1) \
        --mode $mode \
        --config $dir/train.yaml \
        --data_type shard \
        --test_data data/${test_set}/data.list \
        --checkpoint $decode_checkpoint \
        --beam_size 10 \
        --batch_size 1 \
        --penalty 0.0 \
        --dict $dict \
        --ctc_weight $ctc_weight \
        --result_file $test_dir/text \
        ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
      # Score the hypotheses against the references (character error rate).
      python tools/compute-wer.py --char=1 --v=1 \
        data/${test_set}/text $test_dir/text > $test_dir/wer
    } &
    done
  }
  done
  wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
  # Export the best model you want
  # Exports the averaged checkpoint to TorchScript (plus a quantized copy)
  # for use by the C++ runtime.
  python wenet/bin/export_jit.py \
    --config $dir/train.yaml \
    --checkpoint $dir/avg_${average_num}.pt \
    --output_file $dir/final.zip \
    --output_quant_file $dir/final_quant.zip
fi
../../../tools
\ No newline at end of file
../../../wenet
\ No newline at end of file
# Performance Record
## Conformer Result
* Feature info: dither + specaug + speed perturb
* Training info: lr 0.0005, batch size 8, 1 gpu, acc_grad 4, 80 epochs
* Decoding info: average_num 10
| decoding mode | dt05_real_1ch | dt05_simu_1ch | et05_real_1ch | et05_simu_1ch |
|:----------------------:|:-------------:|:-------------:|:-------------:|:-------------:|
| ctc_prefix_beam_search | 19.06% | 21.17% | 28.39% | 29.16% |
| attention_rescoring | 17.92% | 20.22% | 27.40% | 28.25% |
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
split_with_space: true
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 40
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 8
grad_clip: 10
accum_grad: 4
max_epoch: 80
log_interval: 200
optim: adam
optim_conf:
lr: 0.0005
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 20000
#!/usr/bin/env bash
# wujian@2020
#
# Merge the per-condition chime4 lists into combined train/dev data dirs.
set -eu

echo "$0: Formating chime4 data dir..."

track=isolated_1ch_track
data_dir=data/chime4

mkdir -p $data_dir/{train,dev}

# For each list type, concatenate the source conditions and sort by utt-id.
for f in wav.scp text; do
  cat $data_dir/tr05_simu_noisy/$f $data_dir/tr05_real_noisy/$f \
    $data_dir/tr05_orig_clean/$f $data_dir/train_si200_wsj1_clean/$f \
    | sort -k1 > $data_dir/train/$f
  cat $data_dir/dt05_real_${track}/$f $data_dir/dt05_simu_${track}/$f \
    | sort -k1 > $data_dir/dev/$f
done

echo "$0: Format $data_dir done"
#!/usr/bin/env bash
# wujian@2020
#
# Rewrites $data_dir/wav.scp so that sph2pipe pipe entries are replaced by
# the plain .wav files previously dumped into $dump_dir.
set -eu

# Fix: a usage error must exit non-zero so callers (and `set -e` chains)
# actually notice the failure; this previously exited 0.
[ $# -ne 2 ] && echo "Script format error: $0 <data-dir> <dump-dir>" && exit 1

data_dir=$1
dump_dir=$2

mkdir -p $dump_dir

num_utts=$(wc -l < $data_dir/wav.scp)
echo "Original utterances (.wav + .wv1): $num_utts"

# cat $data_dir/wav.scp | grep "sph2pipe" | \
#   awk -v dir=$dump_dir '{printf("%s -f wav %s %s/%s.wav\n", $2, $5, dir, $1)}' | bash

# Keep the entries that are already plain wav files (no sph2pipe pipe).
# NOTE(review): if every entry used sph2pipe, grep -v would exit 1 and
# `set -e` would abort here — presumably never the case; verify.
grep -v "sph2pipe" $data_dir/wav.scp > $data_dir/raw_wav.scp
# Index the dumped wav files as "<utt-id> <path>", where utt-id is the
# basename without the .wav suffix (sed strips the first ".wav", which is
# in the leading key field).
find $dump_dir -name "*.wav" | awk -F '/' '{printf("%s %s\n", $NF, $0)}' | \
  sed 's:\.wav::' > $data_dir/sph_wav.scp
cat $data_dir/{raw_wav,sph_wav}.scp | sort -k1 > $data_dir/wav.scp

num_utts=$(wc -l < $data_dir/wav.scp)
echo "Wave utterances (.wav): $num_utts"
echo "$0: Generate wav => $dump_dir done"
#!/usr/bin/env bash
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# Modified from Kaldi's chime4 recipe
#
# Prepares the clean WSJ0 part of the CHiME-4 setup: builds file lists,
# transcripts, wav.scp entries and spk2gender under data/$dataset.
set -eu

dataset=chime4
. ./tools/parse_options.sh || exit 1;

if [ $# -ne 1 ]; then
  printf "\nUSAGE: %s <original WSJ0 corpus-directory>\n\n" `basename $0`
  echo "The argument should be a the top-level WSJ corpus directory."
  echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory"
  echo "within the top-level corpus directory."
  exit 1;
fi

wsj0=$1
srcdir=$PWD/data/chime4/local
dstdir=$PWD/data/$dataset
local=$PWD/local
utils=$PWD/utils
sph2pipe=sph2pipe
# Build sph2pipe from source if it is not already on PATH; it is needed to
# convert the .wv1 (sphere) audio to wav on the fly via wav.scp pipes.
if [ ! `which sph2pipe` ]; then
  echo "Could not find sph2pipe, install it first..."
  mkdir -p exp && cd exp && wget https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz
  tar -zxf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5
  gcc -o sph2pipe *.c -lm && cd .. && rm -rf sph2pipe_v2.5.tar.gz
  # Absolute path: we are inside exp/ here, and later cd's would break a
  # relative path.
  sph2pipe=$PWD/sph2pipe_v2.5/sph2pipe
  cd ..
fi
# All list/transcript generation below happens inside $srcdir.
mkdir -p $srcdir && cd $srcdir

# This version for SI-84
cat $wsj0/wsj0/doc/indices/train/tr_s_wv1.ndx \
  | $local/cstr_ndx2flist.pl $wsj0 | sort -u > tr05.flist

# Now for the test sets.
# $wsj0/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc. We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.

# Nov'92 (330 utts, 5k vocab)
cat $wsj0/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
  $local/cstr_ndx2flist.pl $wsj0 | sort > et05.flist

# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt.
# Sometimes this gets copied from the CD's with upcasing, don't know
# why (could be older versions of the disks).
find $wsj0/wsj0/si_dt_05 -print | grep -i ".wv1" | sort > dt05.flist

# Finding the transcript files:
find -L $wsj0 -iname '*.dot' > dot_files.flist
# Convert the transcripts into our format (no normalization yet)
# adding suffix to utt_id
# 0 for clean condition
for x in tr05 et05 dt05; do
  # flist -> "utt-id sph-path" table, then look up each utt-id's transcript
  # among the .dot files.
  $local/flist2scp.pl $x.flist | sort > ${x}_sph_tmp.scp
  cat ${x}_sph_tmp.scp | awk '{print $1}' \
    | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1
  cat ${x}_sph_tmp.scp | awk '{printf("%s %s\n", $1, $2);}' > ${x}_sph.scp
  cat ${x}_tmp.trans1 | awk '{printf("%s ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' > ${x}.trans1
done

# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in tr05 et05 dt05; do
  cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
    | sort > $x.txt || exit 1;
done

# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in tr05 et05 dt05; do
  # Each wav.scp entry is a command pipe: "utt-id sph2pipe -f wav path |".
  awk -v cmd=$sph2pipe '{printf("%s %s -f wav %s |\n", $1, cmd, $2);}' ${x}_sph.scp > ${x}_wav.scp
done
# Fetch the speaker-gender table (expected to have 134 lines) if it is
# missing or truncated; try the LDC site first, then a backup mirror.
if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
  rm -f wsj0-train-spkrinfo.txt
  wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \
    || ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
         wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt );
fi

if [ ! -f wsj0-train-spkrinfo.txt ]; then
  echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
  echo "This is possibly omitted from the training disks; couldn't find it."
  echo "Everything else may have worked; we just may be missing gender info"
  echo "which is only needed for VTLN-related diagnostics anyway."
  exit 1
fi

# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web. Perhaps it was accidentally omitted from the
# disks.

# Merge corpus and downloaded speaker info into "speaker gender" pairs:
# lowercase everything, drop ';' comment lines and '--' separator rows.
cat $wsj0/wsj0/doc/spkrinfo.txt \
  ./wsj0-train-spkrinfo.txt | \
  perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
  awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender
# return back
cd -

# Publish text + wav.scp for each partition as the "clean" condition dirs.
for x in et05 dt05 tr05; do
  mkdir -p $dstdir/${x}_orig_clean
  cp $srcdir/$x.txt $dstdir/${x}_orig_clean/text || exit 1
  cp $srcdir/${x}_wav.scp $dstdir/${x}_orig_clean/wav.scp || exit 1
done

echo "Data preparation succeeded"
#!/usr/bin/env bash
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
#
# Prepares the clean WSJ1 SI-200 training data and the Nov'93 dev set under
# data/chime4/{train_si200,test_dev93}_wsj1_clean.
set -eu

if [ $# -ne 1 ]; then
  echo "Arguments should be WSJ1 directory"
  exit 1;
fi

wsj1=$1
dir=$PWD/data/chime4/local
odir=$PWD/data/chime4
mkdir -p $dir
local=$PWD/local
sph2pipe=sph2pipe
# Build sph2pipe from source if it is not on PATH; needed to read the
# .wv1 (sphere) audio files via wav.scp pipes.
if [ ! `which sph2pipe` ]; then
  echo "Could not find sph2pipe, install it first..."
  mkdir -p exp && cd exp && wget https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz
  tar -zxf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5
  gcc -o sph2pipe *.c -lm && cd .. && rm -rf sph2pipe_v2.5.tar.gz
  # Absolute path so the later cd's don't invalidate it.
  sph2pipe=$PWD/sph2pipe_v2.5/sph2pipe
  cd ..
fi
# List/transcript generation happens inside $dir.
cd $dir

# This version for SI-200
cat $wsj1/13-34.1/wsj1/doc/indices/si_tr_s.ndx | \
  $local/ndx2flist.pl $wsj1/??-{?,??}.? | sort > train_si200.flist
nl=`cat train_si200.flist | wc -l`
[ "$nl" -eq 30278 ] || echo "Warning: expected 30278 lines in train_si200.flist, got $nl"

# Dev-set for Nov'93 (503 utts)
cat $wsj1/13-34.1/wsj1/doc/indices/h1_p0.ndx | \
  $local/ndx2flist.pl $wsj1/??-{?,??}.? | sort > test_dev93.flist

# Finding the transcript files:
for x in $wsj1/??-{?,??}.?; do find -L $x -iname '*.dot'; done > dot_files.flist

# Convert the transcripts into our format (no normalization yet)
for x in train_si200 test_dev93; do
  $local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
  cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1
done
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si200 test_dev93; do
  cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1;
done

# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si200 test_dev93; do
  # Each wav.scp entry is a command pipe: "utt-id sph2pipe -f wav path |".
  awk -v cmd=$sph2pipe '{printf("%s %s -f wav %s |\n", $1, cmd, $2);}' ${x}_sph.scp > ${x}_wav.scp
done

# return back
cd -

# Publish text + wav.scp for each partition as the clean-condition dirs.
for x in train_si200 test_dev93; do
  mkdir -p $odir/${x}_wsj1_clean
  cp $dir/$x.txt $odir/${x}_wsj1_clean/text || exit 1
  cp $dir/${x}_wav.scp $odir/${x}_wsj1_clean/wav.scp || exit 1
done

echo "Data preparation WSJ1 succeeded"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment