delete soft link

a7785cc6 · Sugon_ldc · 9a2a05ca · a7785cc6 · a7785cc6 · a7785cc6
Commit a7785cc6 authored Mar 26, 2024 by Sugon_ldc
20 changed files
--- a/examples/aishell/s0/tools/perturb_data_dir_speed.sh
+++ b/examples/aishell/s0/tools/perturb_data_dir_speed.sh
+#!/bin/bash
+
+# 2020 @kamo-naoyuki
+# This file was copied from Kaldi and
+# I deleted parts related to wav duration
+# because we shouldn't use kaldi's command here
+# and we don't need the files actually.
+
+# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
+#           2014  Tom Ko
+#           2018  Emotech LTD (author: Pawel Swietojanski)
+# Apache 2.0
+
+# This script operates on a directory, such as in data/train/,
+# that contains some subset of the following files:
+#  wav.scp
+#  spk2utt
+#  utt2spk
+#  text
+#
+# It generates the files which are used for perturbing the speed of the original data.
+
+export LC_ALL=C
+set -euo pipefail
+
+# Exactly three positional arguments are required.
+if [[ $# -ne 3 ]]; then
+    printf '%s\n' \
+        "Usage: perturb_data_dir_speed.sh <warping-factor> <srcdir> <destdir>" \
+        "e.g.:" \
+        " $0 0.9 data/train_si284 data/train_si284p"
+    exit 1
+fi
+
+factor=$1
+srcdir=$2
+destdir=$3
+
+# Speaker and utterance ids share the same "sp<factor>-" prefix.
+label="sp"
+spk_prefix="${label}${factor}-"
+utt_prefix="${label}${factor}-"
+
+# The generated wav.scp entries pipe audio through sox, so it must be on PATH.
+if ! command -v sox >/dev/null 2>&1; then
+    echo "sox: command not found"
+    exit 1
+fi
+
+[[ -f "${srcdir}/utt2spk" ]] || {
+    echo "$0: no such file ${srcdir}/utt2spk"
+    exit 1
+}
+
+# Writing into the source directory would clobber the files being read.
+[[ "${srcdir}" != "${destdir}" ]] || {
+    echo "$0: this script requires <srcdir> and <destdir> to be different."
+    exit 1
+}
+
+mkdir -p "${destdir}"
+
+# Build "<old-id> <new-id>" maps for utterances, speakers and recordings. Each
+# new id is the old id with the "${label}${factor}-" prefix prepended.
+<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map"
+<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map"
+<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map"
+# utt2uniq ties every perturbed utterance id back to a source id: the original
+# utterance id, or the second column of an existing source utt2uniq.
+if [[ ! -f ${srcdir}/utt2uniq ]]; then
+    <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq"
+else
+    <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq"
+fi
+
+
+# Rename both columns of utt2spk: utterance ids via utt_map, speaker ids via
+# spk_map (presumably "-f N" selects the field apply_map.pl rewrites -- confirm).
+<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \
+  utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk
+
+# Regenerate spk2utt from the renamed utt2spk.
+utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt
+
+if [[ -f ${srcdir}/segments ]]; then
+
+  # Rename utterance and recording ids in segments, then divide the start/end
+  # times (fields 3 and 4) by the speed factor. A segment is dropped unless its
+  # scaled end exceeds its scaled start by more than 0.01.
+  utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \
+      utils/apply_map.pl -f 2 "${destdir}"/reco_map | \
+          awk -v factor="${factor}" \
+            '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \
+            >"${destdir}"/segments
+
+  # With segments present wav.scp is renamed through reco_map. The sed call
+  # normalises a trailing pipe to " |" so the awk test $NF=="|" can detect piped
+  # entries. Inside awk, "$_" is "$" applied to the unset variable "_", i.e. $0
+  # (the line with its first field blanked out).
+  utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \
+      # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename"
+      awk -v factor="${factor}" \
+          '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"}
+            else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" }
+            else  {print wid " sox" $_ " -t wav - speed " factor " |"}}' \
+             > "${destdir}"/wav.scp
+  # reco2file_and_channel is keyed by recording id, so only field 1 is renamed.
+  if [[ -f ${srcdir}/reco2file_and_channel ]]; then
+      utils/apply_map.pl -f 1 "${destdir}"/reco_map \
+       <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel
+  fi
+
+else # no segments->wav indexed by utterance.
+    # Same sox rewrite as the segments branch, but the first field is renamed
+    # through utt_map because wav.scp is indexed by utterance here.
+    if [[ -f ${srcdir}/wav.scp ]]; then
+        utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \
+         # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename"
+         awk -v factor="${factor}" \
+           '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"}
+             else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" }
+             else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \
+                 > "${destdir}"/wav.scp
+    fi
+fi
+
+if [[ -f ${srcdir}/text ]]; then
+    utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text
+fi
+if [[ -f ${srcdir}/spk2gender ]]; then
+    utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender
+fi
+if [[ -f ${srcdir}/utt2lang ]]; then
+    utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang
+fi
+
+rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null
+echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}"
+
+utils/validate_data_dir.sh --no-feats --no-text "${destdir}"
--- a/examples/aishell/s0/tools/reduce_data_dir.sh
+++ b/examples/aishell/s0/tools/reduce_data_dir.sh
+#!/bin/bash
+
+# koried, 10/29/2012
+
+# Reduce a data set based on a list of turn-ids
+
+help_message="usage: $0 srcdir turnlist destdir"
+
+if [ $1 == "--help" ]; then
+    echo "${help_message}"
+    exit 0;
+fi
+
+if [ $# != 3 ]; then
+    echo "${help_message}"
+    exit 1;
+fi
+
+srcdir=$1
+reclist=$2
+destdir=$3
+
+if [ ! -f ${srcdir}/utt2spk ]; then
+echo "$0: no such file $srcdir/utt2spk"
+exit 1;
+fi
+
+function do_filtering {
+# assumes the utt2spk and spk2utt files already exist.
+    [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp
+    [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp
+    [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text
+    [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames
+    [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender
+    [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp
+    if [ -f ${srcdir}/segments ]; then
+        utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments
+        awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings.
+        # The next line would override the command above for wav.scp, which would be incorrect.
+        [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp
+        [ -f ${srcdir}/reco2file_and_channel ] && \
+            utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel
+
+        # Filter the STM file for proper sclite scoring (this will also remove the comments lines)
+        [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm
+        rm ${destdir}/reco
+    fi
+    srcutts=$(wc -l < ${srcdir}/utt2spk)
+    destutts=$(wc -l < ${destdir}/utt2spk)
+    echo "Reduced #utt from $srcutts to $destutts"
+}
+
+mkdir -p "${destdir}"
+
+# Filter utt2spk with the caller-supplied list, then derive spk2utt from the
+# result. Paths are quoted so names containing spaces are not word-split.
+utils/filter_scp.pl "${reclist}" < "${srcdir}/utt2spk" > "${destdir}/utt2spk"
+
+utils/utt2spk_to_spk2utt.pl < "${destdir}/utt2spk" > "${destdir}/spk2utt"
+do_filtering
--- a/examples/aishell/s0/tools/remove_longshortdata.py
+++ b/examples/aishell/s0/tools/remove_longshortdata.py
+#!/usr/bin/env python3
+# encoding: utf-8
+
+import argparse
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='remove too long or too short data in format.data')
+    parser.add_argument('--data_file',
+                        type=str,
+                        help='input format data')
+    parser.add_argument('--output_data_file',
+                        type=str,
+                        help='output format data')
+    parser.add_argument('--min_input_len', type=float,
+                        default=0,
+                        help='minimum input seq length, in seconds for raw wav, \
+                            in frame numbers for feature data')
+    parser.add_argument('--max_input_len', type=float,
+                        default=20,
+                        help='maximum output seq length, in seconds for raw wav, \
+                            in frame numbers for feature data')
+    parser.add_argument('--min_output_len', type=float,
+                        default=0, help='minimum input seq length, in modeling units')
+    parser.add_argument('--max_output_len', type=float,
+                        default=500,
+                        help='maximum output seq length, in modeling units')
+    parser.add_argument('--min_output_input_ratio', type=float, default=0.05,
+                        help='minimum output seq length/output seq length ratio')
+    parser.add_argument('--max_output_input_ratio', type=float, default=10,
+                        help='maximum output seq length/output seq length ratio')
+    args = parser.parse_args()
+
+    data_file = args.data_file
+    output_data_file = args.output_data_file
+    min_input_len = args.min_input_len
+    max_input_len = args.max_input_len
+    min_output_len = args.min_output_len
+    max_output_len = args.max_output_len
+    min_output_input_ratio = args.min_output_input_ratio
+    max_output_input_ratio = args.max_output_input_ratio
+
+    with open(data_file, 'r') as f, open(output_data_file, 'w') as fout:
+        for l in f:
+            l = l.strip()
+            if l:
+                items = l.strip().split('\t')
+                token_shape = items[6]
+                feature_shape = items[2]
+                feat_len = float(feature_shape.split(':')[1].split(',')[0])
+                token_len = float(token_shape.split(':')[1].split(',')[0])
+                condition = [feat_len > min_input_len,
+                             feat_len < max_input_len,
+                             token_len > min_output_len,
+                             token_len < max_output_len,
+                             token_len / feat_len > min_output_input_ratio,
+                             token_len / feat_len < max_output_input_ratio,
+                             ]
+                if all(condition):
+                    fout.write('{}\n'.format(l))
+                    continue
--- a/examples/aishell/s0/tools/segment.py
+++ b/examples/aishell/s0/tools/segment.py
+#!/usr/bin/env python3
+# Copyright (c) 2021 Mobvoi Inc. (Di Wu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+import argparse
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='generate segmented wav.scp')
+    parser.add_argument('--segments', required=True, help='segments file')
+    parser.add_argument('--input',
+                        required=True,
+                        help='origin wav.scp that not segmented')
+    parser.add_argument('--output',
+                        required=True,
+                        help='output segmented wav.scp')
+    wav_dic = {}
+    args = parser.parse_args()
+    ori_wav = args.input
+    segment_file = args.segments
+    wav_scp = args.output
+    with open(ori_wav, 'r') as ori:
+        for l in ori:
+            item = l.strip().split()
+            wav_dic[item[0]] = item[1]
+    with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement:
+        for l in sgement:
+            item = l.strip().split()
+            if item[1] in wav_dic:
+                item[1] = wav_dic[item[1]]
+                f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3]))
--- a/examples/aishell/s0/tools/setup_anaconda.sh
+++ b/examples/aishell/s0/tools/setup_anaconda.sh
+#!/usr/bin/env bash
+# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet)
+set -euo pipefail
+
+if [ -z "${PS1:-}" ]; then
+    PS1=__dummy__
+fi
+CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+
+if [ $# -gt 4 ]; then
+    echo "Usage: $0 [output] [conda-env-name] [python-version>]"
+    exit 1;
+elif [ $# -eq 3 ]; then
+    output_dir="$1"
+    name="$2"
+    PYTHON_VERSION="$3"
+elif [ $# -eq 2 ]; then
+    output_dir="$1"
+    name="$2"
+    PYTHON_VERSION=""
+elif [ $# -eq 1 ]; then
+    output_dir="$1"
+    name=""
+    PYTHON_VERSION=""
+elif [ $# -eq 0 ]; then
+    output_dir=venv
+    name=""
+    PYTHON_VERSION=""
+fi
+
+# activate_python.sh in the current directory is regenerated at the end of this
+# script; warn before that happens.
+if [ -e activate_python.sh ]; then
+    echo "Warning: activate_python.sh already exists. It will be overwritten"
+fi
+
+# Install Miniconda into ${output_dir} unless conda.sh is already there. A
+# previously downloaded miniconda.sh installer is reused.
+if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then
+    if [ ! -e miniconda.sh ]; then
+        wget --tries=3 "${CONDA_URL}" -O miniconda.sh
+    fi
+
+    bash miniconda.sh -b -p "${output_dir}"
+fi
+
+# shellcheck disable=SC1090
+source "${output_dir}/etc/profile.d/conda.sh"
+conda deactivate
+
+# If the env already exists, skip recreation
+if [ -n "${name}" ] && ! conda activate ${name}; then
+    conda create -yn "${name}"
+fi
+# ${name} is left unquoted: when it is empty the command receives no argument
+# at all (presumably activating conda's default environment -- confirm).
+conda activate ${name}
+
+# Install or refresh conda itself, pinning Python only when a version was given.
+if [ -n "${PYTHON_VERSION}" ]; then
+    conda install -y conda "python=${PYTHON_VERSION}"
+else
+    conda install -y conda
+fi
+
+conda install -y pip setuptools
+
+# Write the activation helper. The heredoc delimiter is unquoted, so
+# $(cd ...; pwd) and ${name} are expanded now, while the escaped \${PS1:-} is
+# written literally and evaluated when the helper is sourced.
+cat << EOF > activate_python.sh
+#!/usr/bin/env bash
+# THIS FILE IS GENERATED BY tools/setup_anaconda.sh
+if [ -z "\${PS1:-}" ]; then
+    PS1=__dummy__
+fi
+. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name}
+EOF
--- a/examples/aishell/s0/tools/sph2wav.sh
+++ b/examples/aishell/s0/tools/sph2wav.sh
+#!/bin/bash
+# convert sph scp to segmented wav scp
+nj=1
+. tools/parse_options.sh || exit 1;
+
+inscp=$1
+segments=$2
+outscp=$3
+data=$(dirname ${inscp})
+if [ $# -eq 4 ]; then
+  logdir=$4
+else
+  logdir=${data}/log
+fi
+mkdir -p ${logdir}
+
+sph2pipe_version="v2.5"
+if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then
+  echo "Download sph2pipe_${sph2pipe_version} ......"
+  wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \
+  wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \
+  tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools
+  cd tools/sph2pipe_${sph2pipe_version}/ && \
+        gcc -o sph2pipe  *.c -lm
+  cd -
+fi
+sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe
+[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1;
+sox=`which sox`
+[ ! -x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1;
+
+# For every input line emit two entries, "<id>-A" (sph2pipe -c 1) and "<id>-B"
+# (sph2pipe -c 2). '#' stands in for the spaces of the command so that it stays
+# a single whitespace-free token; the sed calls below turn it back into spaces.
+cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2);
+    printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \
+   sort > $data/wav_ori.scp || exit 1;
+
+# Attach the per-segment times, then make the file comma-separated
+# (utt,command,start,end) and restore the spaces inside the command.
+tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp
+sed -i 's/ /,/g' $data/wav_segments.scp
+sed -i 's/#/ /g' $data/wav_segments.scp
+
+# Remove slices and logs left in the log directory, then split the work into
+# $nj line-based slices.
+rm -f $logdir/wav_*.slice
+rm -f $logdir/*.log
+split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_
+
+# Process each slice in a background job. For every line awk runs
+# "<command> sox - <out>.wav trim <start> <end - start>" through system() and
+# prints "<utt> <out>.wav" for the per-slice scp.
+for slice in `ls $logdir/wav_*.slice`; do
+{
+    name=`basename -s .slice $slice`
+    mkdir -p ${data}/wavs/${name}
+    cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \
+        -v logdir=$logdir -v name=$name '{
+        during=$4-$3
+        cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during;
+        system(cmd)
+        printf("%s %s/%s.wav\n", $1, data, $1);
+        }' | \
+       sort > ${data}/wavs_${name}.scp || exit 1;
+} &
+done
+# Wait for all background jobs, then merge the per-slice scp files.
+wait
+cat ${data}/wavs_*.scp > $outscp
+rm ${data}/wavs_*.scp
--- a/examples/aishell/s0/tools/spk2utt_to_utt2spk.pl
+++ b/examples/aishell/s0/tools/spk2utt_to_utt2spk.pl
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Invert a spk2utt table: every input line "<speaker> <utt1> <utt2> ..." is
+# expanded into one "<utt> <speaker>" output line per utterance.
+while (my $line = <>) {
+    my ($speaker, @utts) = split(" ", $line);
+    # A valid line has a speaker id followed by at least one utterance id.
+    @utts >= 1 || die "Invalid line in spk2utt file: $line";
+    print "$_ $speaker\n" for @utts;
+}
+
+
--- a/examples/aishell/s0/tools/spm_decode
+++ b/examples/aishell/s0/tools/spm_decode
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# https://github.com/pytorch/fairseq/blob/master/LICENSE
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import argparse
+import sys
+
+import sentencepiece as spm
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", required=True,
+                        help="sentencepiece model to use for decoding")
+    parser.add_argument("--input", default=None, help="input file to decode")
+    parser.add_argument("--input_format", choices=["piece", "id"], default="piece")
+    args = parser.parse_args()
+
+    sp = spm.SentencePieceProcessor()
+    sp.Load(args.model)
+
+    if args.input_format == "piece":
+        def decode(l):
+            return "".join(sp.DecodePieces(l))
+    elif args.input_format == "id":
+        def decode(l):
+            return "".join(sp.DecodeIds(l))
+    else:
+        raise NotImplementedError
+
+    def tok2int(tok):
+        # remap reference-side <unk> (represented as <<unk>>) to 0
+        return int(tok) if tok != "<<unk>>" else 0
+
+    if args.input is None:
+        h = sys.stdin
+    else:
+        h = open(args.input, "r", encoding="utf-8")
+    for line in h:
+        print(decode(line.split()))
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/aishell/s0/tools/spm_encode
+++ b/examples/aishell/s0/tools/spm_encode
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in
+# https://github.com/pytorch/fairseq/blob/master/LICENSE
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import argparse
+import contextlib
+import sys
+
+import sentencepiece as spm
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", required=True,
+                        help="sentencepiece model to use for encoding")
+    parser.add_argument("--inputs", nargs="+", default=['-'],
+                        help="input files to filter/encode")
+    parser.add_argument("--outputs", nargs="+", default=['-'],
+                        help="path to save encoded outputs")
+    parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
+    parser.add_argument("--min-len", type=int, metavar="N",
+                        help="filter sentence pairs with fewer than N tokens")
+    parser.add_argument("--max-len", type=int, metavar="N",
+                        help="filter sentence pairs with more than N tokens")
+    args = parser.parse_args()
+
+    assert len(args.inputs) == len(args.outputs), \
+        "number of input and output paths should match"
+
+    sp = spm.SentencePieceProcessor()
+    sp.Load(args.model)
+
+    if args.output_format == "piece":
+        def encode(l):
+            return sp.EncodeAsPieces(l)
+    elif args.output_format == "id":
+        def encode(l):
+            return list(map(str, sp.EncodeAsIds(l)))
+    else:
+        raise NotImplementedError
+
+    if args.min_len is not None or args.max_len is not None:
+        def valid(line):
+            return (
+                (args.min_len is None or len(line) >= args.min_len) and
+                (args.max_len is None or len(line) <= args.max_len)
+            )
+    else:
+        def valid(lines):
+            return True
+
+    with contextlib.ExitStack() as stack:
+        inputs = [
+            stack.enter_context(open(input, "r", encoding="utf-8"))
+            if input != "-" else sys.stdin
+            for input in args.inputs
+        ]
+        outputs = [
+            stack.enter_context(open(output, "w", encoding="utf-8"))
+            if output != "-" else sys.stdout
+            for output in args.outputs
+        ]
+
+        stats = {
+            "num_empty": 0,
+            "num_filtered": 0,
+        }
+
+        def encode_line(line):
+            line = line.strip()
+            if len(line) > 0:
+                line = encode(line)
+                if valid(line):
+                    return line
+                else:
+                    stats["num_filtered"] += 1
+            else:
+                stats["num_empty"] += 1
+            return None
+
+        for i, lines in enumerate(zip(*inputs), start=1):
+            enc_lines = list(map(encode_line, lines))
+            if not any(enc_line is None for enc_line in enc_lines):
+                for enc_line, output_h in zip(enc_lines, outputs):
+                    print(" ".join(enc_line), file=output_h)
+            if i % 10000 == 0:
+                print("processed {} lines".format(i), file=sys.stderr)
+
+        print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr)
+        print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/aishell/s0/tools/spm_train
+++ b/examples/aishell/s0/tools/spm_train
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# https://github.com/pytorch/fairseq/blob/master/LICENSE
+import sys
+
+import sentencepiece as spm
+
+
+if __name__ == "__main__":
+    spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
--- a/examples/aishell/s0/tools/subset_data_dir.sh
+++ b/examples/aishell/s0/tools/subset_data_dir.sh
+#!/usr/bin/env bash
+# Copyright 2010-2011  Microsoft Corporation
+#           2012-2013  Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0
+
+
+# This script operates on a data directory, such as in data/train/.
+# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data
+# for what these directories contain.
+
+# This script creates a subset of that data, consisting of some specified
+# number of utterances.  (The selected utterances are distributed evenly
+# throughout the file, by the program ./subset_scp.pl).
+
+# There are seven options, none compatible with any other.
+
+# If you give the --per-spk option, it will attempt to select the supplied
+# number of utterances for each speaker (typically you would supply a much
+# smaller number in this case).
+
+# If you give the --speakers option, it selects a subset of n randomly
+# selected speakers.
+
+# If you give the --shortest option, it will give you the n shortest utterances.
+
+# If you give the --first option, it will just give you the n first utterances.
+
+# If you give the --last option, it will just give you the n last utterances.
+
+# If you give the --spk-list or --utt-list option, it reads the
+# speakers/utterances to keep from <speaker-list-file>/<utt-list-file> (note,
+# in this case there is no <num-utt> positional parameter; see usage message.)
+
+
+# Mode flags. At most one is selected, by the first command-line argument.
+shortest=false
+perspk=false
+speakers=false
+first_opt=
+spk_list=
+utt_list=
+
+# Number of positional arguments expected once the option has been consumed.
+# The list modes take no <num-utt>, so they expect only two.
+expect_args=3
+case $1 in
+  --first|--last) first_opt=$1; shift ;;
+  --per-spk)  perspk=true; shift ;;
+  --shortest) shortest=true; shift ;;
+  --speakers) speakers=true; shift ;;
+  --spk-list) shift; spk_list=$1; shift; expect_args=2 ;;
+  --utt-list) shift; utt_list=$1; shift; expect_args=2 ;;
+  --*) echo "$0: invalid option '$1'"; exit 1
+esac
+
+if [ $# != $expect_args ]; then
+  echo "Usage:"
+  echo "  subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>"
+  echo "  subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>"
+  echo "  subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>"
+  echo "By default, randomly selects <num-utt> utterances from the data directory."
+  echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances"
+  echo "With --per-spk, selects <num-utt> utterances per speaker, if available."
+  echo "With --first, selects the first <num-utt> utterances"
+  echo "With --last, selects the last <num-utt> utterances"
+  echo "With --shortest, selects the shortest <num-utt> utterances."
+  echo "With --spk-list, reads the speakers to keep from <speaker-list-file>"
+  echo "With --utt-list, reads the utterances to keep from <utt-list-file>"
+  exit 1;
+fi
+
+# Positional arguments: <srcdir> [<num-utt>] <destdir>. numutt stays empty in
+# the list modes.
+srcdir=$1
+if [[ $spk_list || $utt_list ]]; then
+  numutt=
+  destdir=$2
+else
+  numutt=$2
+  destdir=$3
+fi
+
+export LC_ALL=C
+
+if [ ! -f $srcdir/utt2spk ]; then
+  echo "$0: no such file $srcdir/utt2spk"
+  exit 1
+fi
+
+# Only checked when a count was given: it may not exceed the number of lines
+# in the source utt2spk.
+if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then
+  echo "$0: cannot subset to more utterances than you originally had."
+  exit 1
+fi
+
+# --shortest reads utterance lengths from feats.scp, so that file must exist.
+if $shortest && [ ! -f $srcdir/feats.scp ]; then
+  echo "$0: you selected --shortest but no feats.scp exist."
+  exit 1
+fi
+
+mkdir -p $destdir || exit 1
+
+# Produce $destdir/utt2spk and $destdir/spk2utt according to the selected mode.
+# Whichever of the two is written first, the other is derived from it.
+if [[ $spk_list ]]; then
+  tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1;
+  tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1;
+elif [[ $utt_list ]]; then
+  tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1;
+  tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1;
+elif $speakers; then
+  # Shuffle the speakers, then keep whole speakers while the running total of
+  # utterances (NF-1 per line) is still below numutt.
+  tools/shuffle_list.pl < $srcdir/spk2utt |
+    awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' |
+    sort > $destdir/spk2utt
+  tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
+elif $perspk; then
+  # For each speaker keep at most n utterances spread across its list: skip is
+  # the largest stride with n*skip <= number of utterances (minimum 1), and
+  # fields 2, 2+skip, ... up to n*skip+1 are printed.
+  awk '{ n='$numutt'; printf("%s ",$1);
+         skip=1; while(n*(skip+1) <= NF-1) { skip++; }
+         for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); }
+         printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt
+  tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
+else
+  if $shortest; then
+    # Select $numutt shortest utterances.
+    # path.sh is sourced first (presumably to put feat-to-len on PATH -- confirm).
+    . ./path.sh
+    feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1;
+    sort -n -k2 $destdir/tmp.len |
+      awk '{print $1}' |
+      head -$numutt >$destdir/tmp.uttlist
+    tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk
+    rm $destdir/tmp.uttlist $destdir/tmp.len
+  else
+    # Select $numutt random utterances.
+    # $first_opt is empty, --first or --last and is passed through unquoted so
+    # that an empty value adds no argument.
+    tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1;
+  fi
+  tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt
+fi
+
+# Perform filtering. utt2spk and spk2utt files already exist by this point.
+# Each optional file is filtered only if it exists in the source directory.
+# Filter by utterance.
+[ -f $srcdir/feats.scp ] &&
+  tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp
+[ -f $srcdir/vad.scp ] &&
+  tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp
+[ -f $srcdir/utt2lang ] &&
+  tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang
+[ -f $srcdir/utt2dur ] &&
+  tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur
+[ -f $srcdir/utt2num_frames ] &&
+  tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames
+[ -f $srcdir/utt2uniq ] &&
+  tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq
+[ -f $srcdir/wav.scp ] &&
+  tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp
+[ -f $srcdir/utt2warp ] &&
+  tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp
+[ -f $srcdir/text ] &&
+  tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text
+
+# Filter by speaker.
+[ -f $srcdir/spk2warp ] &&
+  tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp
+[ -f $srcdir/spk2gender ] &&
+  tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender
+[ -f $srcdir/cmvn.scp ] &&
+  tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp
+
+# Filter by recording-id.
+if [ -f $srcdir/segments ]; then
+  tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments
+  # Recording-ids are in segments.
+  awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco
+  # The next line overrides the command above for wav.scp, which would be incorrect.
+  # NOTE(review): that override is commented out, so wav.scp keeps the
+  # utterance-id filtering applied earlier. If wav.scp is keyed by recording-id
+  # whenever segments exists, the earlier filter may drop entries that should
+  # be kept -- confirm that disabling the override is intended.
+  #[ -f $srcdir/wav.scp ] &&
+  #  tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp
+else
+  # No segments; recording-ids are in wav.scp.
+  # NOTE(review): this reads $destdir/wav.scp unconditionally; awk will report
+  # an error when the source directory had no wav.scp.
+  awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco
+fi
+
+[ -f $srcdir/reco2file_and_channel ] &&
+  tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
+[ -f $srcdir/reco2dur ] &&
+  tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur
+
+# Filter the STM file for proper sclite scoring.
+# Copy over the comments from STM file.
+[ -f $srcdir/stm ] &&
+  (grep "^;;" $srcdir/stm
+   tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm
+
+# The recording-id list is only an intermediate file.
+rm $destdir/reco
+
+# Copy frame_shift if present.
+[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir
+
+srcutts=$(wc -l <$srcdir/utt2spk)
+destutts=$(wc -l <$destdir/utt2spk)
+echo "$0: reducing #utt from $srcutts to $destutts"
+exit 0
--- a/examples/aishell/s0/tools/subset_scp.pl
+++ b/examples/aishell/s0/tools/subset_scp.pl
+#!/usr/bin/env perl
+use warnings; #sed replacement for -w perl parameter
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This program selects a subset of N elements in the scp.
+
+# By default, it selects them evenly from throughout the scp, in order to avoid
+# selecting too many from the same speaker.  It prints them on the standard
+# output.
+# With the option --first, it just selects the N first utterances.
+# With the option --last, it just selects the N last utterances.
+
+# Last modified by JHU & HKUST @2013
+
+
+# Option flags.  The options are only recognized in this fixed order:
+# --quiet first, then --first, then --last.
+$quiet = 0;
+$first = 0;
+$last = 0;
+
+if (@ARGV > 0 && $ARGV[0] eq "--quiet") {
+  shift;
+  $quiet = 1;
+}
+if (@ARGV > 0 && $ARGV[0] eq "--first") {
+  shift;
+  $first = 1;
+}
+if (@ARGV > 0 && $ARGV[0] eq "--last") {
+  shift;
+  $last = 1;
+}
+
+if(@ARGV < 2 ) {
+    die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" .
+        " --quiet  causes it to not die if N > num lines in scp.\n" .
+        " --first and --last make it equivalent to head or tail.\n" .
+        "See also: filter_scp.pl\n";
+}
+
+$N = shift @ARGV;
+# Check the textual form first: a purely numeric comparison warns on
+# non-numeric input and would accept values such as "12abc" or "-3".
+if($N !~ m/^\d+$/ || $N == 0) {
+    die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\"";
+}
+$inscp = shift @ARGV;
+my $in_fh;
+if ($inscp eq "-") {
+  # "-" keeps meaning standard input, as it did with the two-argument open.
+  $in_fh = \*STDIN;
+} else {
+  # Three-argument open: the file name is never parsed for mode characters.
+  open($in_fh, "<", $inscp) || die "Opening input scp file $inscp: $!";
+}
+
+# Hold every input line in @F; $numlines is the number of lines read.
+@F = <$in_fh>;
+$numlines = @F;
+if($N > $numlines) {
+  if ($quiet) {
+    # With --quiet, asking for too many lines is clamped instead of fatal.
+    $N = $numlines;
+  } else {
+    die "You requested from subset_scp.pl more elements than available: $N > $numlines";
+  }
+}
+
+# Prints $want lines taken from $F[$lo] .. $F[$hi-1].  The range is halved
+# recursively and the quota is split between the two halves, the second half
+# receiving the extra line when the quota is odd.  Dies if more lines are
+# requested than the range contains.
+sub select_n {
+  my ($lo, $hi, $want) = @_;
+  my $span = $hi - $lo;
+  die "select_n: code error" if $want > $span;
+  if ($span == 1) {
+    # A single candidate line: emit it only if the quota asks for it.
+    print $F[$lo] if $want > 0;
+    return;
+  }
+  my $mid = $lo + int($span/2);
+  my $left_want = int($want/2);
+  select_n($lo, $mid, $left_want);
+  select_n($mid, $hi, $want - $left_want);
+}
+
+# Pick the selection mode: --first takes precedence over --last; with neither
+# flag the lines are chosen by select_n.
+if ($first) {
+  # --first option: same as head.
+  for (my $idx = 0; $idx < $N; $idx++) {
+    print $F[$idx];
+  }
+} elsif ($last) {
+  # --last option: same as tail.
+  for (my $idx = @F - $N; $idx < @F; $idx++) {
+    print $F[$idx];
+  }
+} elsif ($N > 0) {
+  select_n(0, $numlines, $N);
+}
--- a/examples/aishell/s0/tools/sym2int.pl
+++ b/examples/aishell/s0/tools/sym2int.pl
+#!/usr/bin/env perl
+# Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# NOTE(review): $ignore_oov is not read anywhere in the visible part of this script.
+$ignore_oov = 0;
+
+# Two passes so that --map-oov and -f are each recognized once; within a pass
+# --map-oov is tested before -f.
+for($x = 0; $x < 2; $x++) {
+  if ($ARGV[0] eq "--map-oov") {
+    shift @ARGV;
+    $map_oov = shift @ARGV;
+    if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") {
+      # disallow '-f', the empty string and anything ending in words.txt as the
+      # OOV symbol because these are likely command-line errors.
+      die "the --map-oov option requires an argument";
+    }
+  }
+  if ($ARGV[0] eq "-f") {
+    shift @ARGV;
+    $field_spec = shift @ARGV;
+    # A single number selects exactly that (one-based) field.
+    if ($field_spec =~ m/^\d+$/) {
+      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
+    }
+    # A range; either end may be omitted, which leaves that bound undefined.
+    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
+      if ($1 ne "") {
+        $field_begin = $1 - 1;  # Change to zero-based indexing.
+      }
+      if ($2 ne "") {
+        $field_end = $2 - 1;    # Change to zero-based indexing.
+      }
+    }
+    # Neither form set a bound: the field specification was not usable.
+    if (!defined $field_begin && !defined $field_end) {
+      die "Bad argument to -f option: $field_spec";
+    }
+  }
+}
+
+# Load the symbol table: each line must be "<symbol> <integer>".
+$symtab = shift @ARGV;
+if (!defined $symtab) {
+  print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
+    "options: [--map-oov <oov-symbol> ]  [-f <field-range> ]\n" .
+      "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n";
+  # Stop here: without a symbol table name there is nothing to open below.
+  exit(1);
+}
+open(F, "<$symtab") || die "Error opening symbol table file $symtab: $!";
+while(<F>) {
+    @A = split(" ", $_);
+    @A == 2 || die "bad line in symbol table file: $_";
+    $sym2int{$A[0]} = $A[1] + 0;  # "+ 0" stores the id as a number
+}
+close(F);
+
+# A non-numeric --map-oov value is a symbol: replace it by its integer id.
+if (defined $map_oov && $map_oov !~ m/^\d+$/) {
+  defined $sym2int{$map_oov} || die "OOV symbol $map_oov not defined.";
+  $map_oov = $sym2int{$map_oov};
+}
+
+$num_warning = 0;
+$max_warning = 20;
+
+# Map the selected fields of every input line from symbols to integers;
+# fields outside the -f range are copied through unchanged.
+while (my $line = <>) {
+  my @fields = split(" ", $line);
+  my @mapped = ();
+  foreach my $idx (0 .. $#fields) {
+    my $token = $fields[$idx];
+    my $in_range = (!defined $field_begin || $idx >= $field_begin)
+                   && (!defined $field_end || $idx <= $field_end);
+    if ($in_range) {
+      my $id = $sym2int{$token};
+      if (!defined ($id)) {
+        if (!defined $map_oov) {
+          # No OOV replacement configured: an unknown symbol is fatal.
+          my $human_pos = $idx+1;
+          die "sym2int.pl: undefined symbol $token (in position $human_pos)\n";
+        }
+        # Report only the first $max_warning replacements.
+        if ($num_warning++ < $max_warning) {
+          print STDERR "sym2int.pl: replacing $token with $map_oov\n";
+          if ($num_warning == $max_warning) {
+            print STDERR "sym2int.pl: not warning for OOVs any more times\n";
+          }
+        }
+        $id = $map_oov;
+      }
+      $token = $id;
+    }
+    push @mapped, $token;
+  }
+  print join(" ", @mapped);
+  print "\n";
+}
+if ($num_warning > 0) {
+  print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n";
+}
+
+exit(0);
--- a/examples/aishell/s0/tools/text2token.py
+++ b/examples/aishell/s0/tools/text2token.py
+#!/usr/bin/env python3
+
+# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
+# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan)
+# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import codecs
+import re
+import sys
+
+is_python2 = sys.version_info[0] == 2
+
+
+def exist_or_not(i, match_pos):
+    start_pos = None
+    end_pos = None
+    for pos in match_pos:
+        if pos[0] <= i < pos[1]:
+            start_pos = pos[0]
+            end_pos = pos[1]
+            break
+
+    return start_pos, end_pos
+
+def seg_char(sent):
+    pattern = re.compile(r'([\u4e00-\u9fa5])')
+    chars = pattern.split(sent)
+    chars = [w for w in chars if len(w.strip()) > 0]
+    return chars
+
+def get_parser():
+    """Build the command-line parser for this script.
+
+    Returns an ``argparse.ArgumentParser`` with the options --nchar,
+    --skip-ncols, --space, --bpe-model, --non-lang-syms and --trans_type,
+    plus one optional positional ``text`` argument.
+    """
+    parser = argparse.ArgumentParser(
+        description='convert raw text to tokenized text',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--nchar',
+                        '-n',
+                        default=1,
+                        type=int,
+                        help='number of characters to split, i.e., \
+                        aabb -> a a b b with -n 1 and aa bb with -n 2')
+    parser.add_argument('--skip-ncols',
+                        '-s',
+                        default=0,
+                        type=int,
+                        help='skip first n columns')
+    parser.add_argument('--space',
+                        default='<space>',
+                        type=str,
+                        help='space symbol')
+    parser.add_argument('--bpe-model',
+                        '-m',
+                        default=None,
+                        type=str,
+                        help='bpe model for english part')
+    parser.add_argument('--non-lang-syms',
+                        '-l',
+                        default=None,
+                        type=str,
+                        help='list of non-linguistic symobles,'
+                        ' e.g., <NOISE> etc.')
+    # Optional positional argument; it defaults to False when omitted.
+    parser.add_argument('text',
+                        type=str,
+                        default=False,
+                        nargs='?',
+                        help='input text')
+    parser.add_argument('--trans_type',
+                        '-t',
+                        type=str,
+                        default="char",
+                        choices=["char", "phn", "cn_char_en_bpe"],
+                        help="""Transcript type. char/phn. e.g., for TIMIT
+                             FADG0_SI1279 -
+                             If trans_type is char, read from
+                             SI1279.WRD file -> "bricks are an alternative"
+                             Else if trans_type is phn,
+                             read from SI1279.PHN file ->
+                             "sil b r ih sil k s aa r er n aa l
+                             sil t er n ih sil t ih v sil" """)
+    return parser
+
+
+def main():
+    parser = get_parser()
+    args = parser.parse_args()
+
+    rs = []
+    if args.non_lang_syms is not None:
+        with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f:
+            nls = [x.rstrip() for x in f.readlines()]
+            rs = [re.compile(re.escape(x)) for x in nls]
+
+    if args.bpe_model is not None:
+        import sentencepiece as spm
+        sp = spm.SentencePieceProcessor()
+        sp.load(args.bpe_model)
+
+    if args.text:
+        f = codecs.open(args.text, encoding="utf-8")
+    else:
+        f = codecs.getreader("utf-8")(
+            sys.stdin if is_python2 else sys.stdin.buffer)
+
+    sys.stdout = codecs.getwriter("utf-8")(
+        sys.stdout if is_python2 else sys.stdout.buffer)
+    line = f.readline()
+    n = args.nchar
+    while line:
+        x = line.split()
+        print(' '.join(x[:args.skip_ncols]), end=" ")
+        a = ' '.join(x[args.skip_ncols:])
+
+        # get all matched positions
+        match_pos = []
+        for r in rs:
+            i = 0
+            while i >= 0:
+                m = r.search(a, i)
+                if m:
+                    match_pos.append([m.start(), m.end()])
+                    i = m.end()
+                else:
+                    break
+
+        if len(match_pos) > 0:
+            chars = []
+            i = 0
+            while i < len(a):
+                start_pos, end_pos = exist_or_not(i, match_pos)
+                if start_pos is not None:
+                    chars.append(a[start_pos:end_pos])
+                    i = end_pos
+                else:
+                    chars.append(a[i])
+                    i += 1
+            a = chars
+
+        if (args.trans_type == "phn"):
+            a = a.split(" ")
+        elif args.trans_type == "cn_char_en_bpe":
+            b = seg_char(a)
+            a = []
+            for j in b:
+                # we use "▁" to instead of blanks among english words
+                # warning: here is "▁", not "_"
+                for l in j.strip().split("▁"):
+                    if not l.encode('UTF-8').isalpha():
+                        a.append(l)
+                    else:
+                        for k in sp.encode_as_pieces(l):
+                            a.append(k)
+        else:
+            a = [a[j:j + n] for j in range(0, len(a), n)]
+
+        a_flat = []
+        for z in a:
+            a_flat.append("".join(z))
+
+        a_chars = [z.replace(' ', args.space) for z in a_flat]
+        if (args.trans_type == "phn"):
+            a_chars = [z.replace("sil", args.space) for z in a_chars]
+        print(' '.join(a_chars))
+        line = f.readline()
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/aishell/s0/tools/utt2spk_to_spk2utt.pl
+++ b/examples/aishell/s0/tools/utt2spk_to_spk2utt.pl
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# converts an utt2spk file to a spk2utt file.
+# Takes input from the stdin or from a file argument;
+# output goes to the standard out.
+
+if ( @ARGV > 1 ) {
+    die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
+}
+
+# Collect the utterances of each speaker; @spklist remembers the order in
+# which speakers were first seen so the output follows the input order.
+while (my $line = <>) {
+    my @fields = split(" ", $line);
+    @fields == 2 || die "Invalid line in utt2spk file: $line";
+    my ($utt, $spk) = @fields;
+    push @spklist, $spk unless exists $spk_hash{$spk};
+    push @{$spk_hash{$spk}}, "$utt";
+}
+# One output line per speaker: the speaker id followed by its utterances.
+foreach my $spk (@spklist) {
+    print "$spk " . join(' ', @{$spk_hash{$spk}}) . "\n";
+}
--- a/examples/aishell/s0/tools/validate_data_dir.sh
+++ b/examples/aishell/s0/tools/validate_data_dir.sh
+#!/bin/bash
+
+# Remember the original arguments (options included); they are forwarded to
+# another validation script further down.
+cmd="$@"
+
+# Each flag relaxes one requirement; all requirements apply by default.
+no_feats=false
+no_wav=false
+no_text=false
+no_spk_sort=false
+
+# Four passes over the four flag tests, so the flags are accepted in any
+# order.  Every matched flag is shifted off the argument list.
+for x in `seq 4`; do
+  if [ "$1" == "--no-feats" ]; then
+    no_feats=true
+    shift;
+  fi
+  if [ "$1" == "--no-text" ]; then
+    no_text=true
+    shift;
+  fi
+  if [ "$1" == "--no-wav" ]; then
+    no_wav=true
+    shift;
+  fi
+  if [ "$1" == "--no-spk-sort" ]; then
+    no_spk_sort=true
+    shift;
+  fi
+done
+
+if [ $# -ne 1 ]; then
+  echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] <data-dir>"
+  echo "The --no-xxx options mean that the script does not require "
+  echo "xxx.scp to be present, but it will check it if it is present."
+  echo "--no-spk-sort means that the script does not require the utt2spk to be "
+  echo "sorted by the speaker-id in addition to being sorted by utterance-id."
+  echo "By default, utt2spk is expected to be sorted by both, which can be "
+  echo "achieved by making the speaker-id prefixes of the utterance-ids"
+  echo "e.g.: $0 data/train"
+  exit 1;
+fi
+
+data=$1
+
+if [ ! -d $data ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+if [ -f $data/images.scp ]; then
+  cmd=${cmd/--no-wav/}  # remove --no-wav if supplied
+  image/validate_data_dir.sh $cmd
+  exit $?
+fi
+
+for f in spk2utt utt2spk; do
+  if [ ! -f $data/$f ]; then
+    echo "$0: no such file $f"
+    exit 1;
+  fi
+  if [ ! -s $data/$f ]; then
+    echo "$0: empty file $f"
+    exit 1;
+  fi
+done
+
+! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \
+  echo "$0: $data/utt2spk has wrong format." && exit;
+
+ns=$(wc -l < $data/spk2utt)
+if [ "$ns" == 1 ]; then
+  echo "$0: WARNING: you have only one speaker.  This probably a bad idea."
+  echo "   Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html"
+  echo "   for more information."
+fi
+
+
+tmpdir=$(mktemp -d /tmp/kaldi.XXXX);
+trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM
+
+export LC_ALL=C
+
+function check_sorted_and_uniq {
+  ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1;
+  ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \
+    echo "$0: file $1 is not in sorted order or has duplicates" && exit 1;
+}
+
+function partial_diff {
+  diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6)
+  n1=`cat $1 | wc -l`
+  n2=`cat $2 | wc -l`
+  echo "[Lengths are $1=$n1 versus $2=$n2]"
+}
+
+check_sorted_and_uniq $data/utt2spk
+
+if ! $no_spk_sort; then
+  ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \
+     echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \
+     echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1;
+fi
+
+check_sorted_and_uniq $data/spk2utt
+
+! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \
+     <(tools/spk2utt_to_utt2spk.pl $data/spk2utt)  && \
+   echo "$0: spk2utt and utt2spk do not seem to match" && exit 1;
+
+cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts
+
+if [ ! -f $data/text ] && ! $no_text; then
+  echo "$0: no such file $data/text (if this is by design, specify --no-text)"
+  exit 1;
+fi
+
+num_utts=`cat $tmpdir/utts | wc -l`
+if [ -f $data/text ]; then
+  tools/validate_text.pl $data/text || exit 1;
+  check_sorted_and_uniq $data/text
+  text_len=`cat $data/text | wc -l`
+  illegal_sym_list="<s> </s> #0"
+  for x in $illegal_sym_list; do
+    if grep -w "$x" $data/text > /dev/null; then
+      echo "$0: Error: in $data, text contains illegal symbol $x"
+      exit 1;
+    fi
+  done
+  awk '{print $1}' < $data/text > $tmpdir/utts.txt
+  if ! cmp -s $tmpdir/utts{,.txt}; then
+    echo "$0: Error: in $data, utterance lists extracted from utt2spk and text"
+    echo "$0: differ, partial diff is:"
+    partial_diff $tmpdir/utts{,.txt}
+    exit 1;
+  fi
+fi
+
+if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then
+  echo "$0: in directory $data, segments file exists but no wav.scp"
+  exit 1;
+fi
+
+
+if [ ! -f $data/wav.scp ] && ! $no_wav; then
+  echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)"
+  exit 1;
+fi
+
+if [ -f $data/wav.scp ]; then
+  check_sorted_and_uniq $data/wav.scp
+
+  if grep -E -q '^\S+\s+~' $data/wav.scp; then
+    # note: it's not a good idea to have any kind of tilde in wav.scp, even if
+    # part of a command, as it would cause compatibility problems if run by
+    # other users, but this used to be not checked for so we let it slide unless
+    # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which
+    # would definitely cause problems as the fopen system call does not do
+    # tilde expansion.
+    echo "$0: Please do not use tilde (~) in your wav.scp."
+    exit 1;
+  fi
+
+  if [ -f $data/segments ]; then
+
+    check_sorted_and_uniq $data/segments
+    # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids.
+    ! cat $data/segments | \
+      awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \
+      echo "$0: badly formatted segments file" && exit 1;
+
+    segments_len=`cat $data/segments | wc -l`
+    if [ -f $data/text ]; then
+      ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \
+        echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \
+        echo "$0: Lengths are $segments_len vs $num_utts" && \
+        exit 1
+    fi
+
+    cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings
+    awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav
+    if ! cmp -s $tmpdir/recordings{,.wav}; then
+      echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp"
+      echo "$0: differ, partial diff is:"
+      partial_diff $tmpdir/recordings{,.wav}
+      exit 1;
+    fi
+    if [ -f $data/reco2file_and_channel ]; then
+      # this file is needed only for ctm scoring; it's indexed by recording-id.
+      check_sorted_and_uniq $data/reco2file_and_channel
+      ! cat $data/reco2file_and_channel | \
+        awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
+                if ( NF == 3 && $3 == "1" ) {
+                  warning_issued = 1;
+                } else {
+                  print "Bad line ", $0; exit 1;
+                }
+              }
+            }
+            END {
+              if (warning_issued == 1) {
+                print "The channel should be marked as A or B, not 1! You should change it ASAP! "
+              }
+            }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
+      cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc
+      if ! cmp -s $tmpdir/recordings{,.r2fc}; then
+        echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel"
+        echo "$0: differ, partial diff is:"
+        partial_diff $tmpdir/recordings{,.r2fc}
+        exit 1;
+      fi
+    fi
+  else
+    # No segments file -> assume wav.scp indexed by utterance.
+    cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav
+    if ! cmp -s $tmpdir/utts{,.wav}; then
+      echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp"
+      echo "$0: differ, partial diff is:"
+      partial_diff $tmpdir/utts{,.wav}
+      exit 1;
+    fi
+
+    if [ -f $data/reco2file_and_channel ]; then
+      # this file is needed only for ctm scoring; it's indexed by recording-id.
+      check_sorted_and_uniq $data/reco2file_and_channel
+      # Each line needs three fields with channel A or B; channel "1" is
+      # tolerated but triggers a warning once the whole file has been read.
+      ! cat $data/reco2file_and_channel | \
+        awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
+                if ( NF == 3 && $3 == "1" ) {
+                  warning_issued = 1;
+                } else {
+                  print "Bad line ", $0; exit 1;
+                }
+              }
+            }
+            END {
+              if (warning_issued == 1) {
+                print "The channel should be marked as A or B, not 1! You should change it ASAP! "
+              }
+            }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
+      cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc
+      # There is no segments file in this branch, so the ids are compared with
+      # the utterance list taken from utt2spk.
+      if ! cmp -s $tmpdir/utts{,.r2fc}; then
+        echo "$0: Error: in $data, utterance-ids extracted from utt2spk and reco2file_and_channel"
+        echo "$0: differ, partial diff is:"
+        partial_diff $tmpdir/utts{,.r2fc}
+        exit 1;
+      fi
+    fi
+  fi
+fi
+
+if [ ! -f $data/feats.scp ] && ! $no_feats; then
+  echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)"
+  exit 1;
+fi
+
+if [ -f $data/feats.scp ]; then
+  check_sorted_and_uniq $data/feats.scp
+  cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats
+  if ! cmp -s $tmpdir/utts{,.feats}; then
+    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features"
+    echo "$0: differ, partial diff is:"
+    partial_diff $tmpdir/utts{,.feats}
+    exit 1;
+  fi
+fi
+
+
+if [ -f $data/cmvn.scp ]; then
+  check_sorted_and_uniq $data/cmvn.scp
+  cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn
+  cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
+  if ! cmp -s $tmpdir/speakers{,.cmvn}; then
+    echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn"
+    echo "$0: differ, partial diff is:"
+    partial_diff $tmpdir/speakers{,.cmvn}
+    exit 1;
+  fi
+fi
+
+if [ -f $data/spk2gender ]; then
+  check_sorted_and_uniq $data/spk2gender
+  ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \
+     echo "$0: Mal-formed spk2gender file" && exit 1;
+  cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender
+  cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
+  if ! cmp -s $tmpdir/speakers{,.spk2gender}; then
+    echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender"
+    echo "$0: differ, partial diff is:"
+    partial_diff $tmpdir/speakers{,.spk2gender}
+    exit 1;
+  fi
+fi
+
+if [ -f $data/spk2warp ]; then
+  check_sorted_and_uniq $data/spk2warp
+  ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \
+     echo "$0: Mal-formed spk2warp file" && exit 1;
+  cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp
+  cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
+  if ! cmp -s $tmpdir/speakers{,.spk2warp}; then
+    echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp"
+    echo "$0: differ, partial diff is:"
+    partial_diff $tmpdir/speakers{,.spk2warp}
+    exit 1;
+  fi
+fi
+
+if [ -f $data/utt2warp ]; then
+  check_sorted_and_uniq $data/utt2warp
+  ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \
+     echo "$0: Mal-formed utt2warp file" && exit 1;
+  cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp
+  cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts
+  if ! cmp -s $tmpdir/utts{,.utt2warp}; then
+    echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp"
+    echo "$0: differ, partial diff is:"
+    partial_diff $tmpdir/utts{,.utt2warp}
+    exit 1;
+  fi
+fi
+
+# check some optionally-required things
+for f in vad.scp utt2lang utt2uniq; do
+  if [ -f $data/$f ]; then
+    check_sorted_and_uniq $data/$f
+    if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \
+      <( awk '{print $1}' $data/$f ); then
+      echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list"
+      exit 1;
+    fi
+  fi
+done
+
+
+if [ -f $data/utt2dur ]; then
+  check_sorted_and_uniq $data/utt2dur
+  cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur
+  if ! cmp -s $tmpdir/utts{,.utt2dur}; then
+    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file"
+    echo "$0: differ, partial diff is:"
+    partial_diff $tmpdir/utts{,.utt2dur}
+    exit 1;
+  fi
+  cat $data/utt2dur | \
+    awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1
+fi
+
+if [ -f $data/utt2num_frames ]; then
+  check_sorted_and_uniq $data/utt2num_frames
+  cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames
+  if ! cmp -s $tmpdir/utts{,.utt2num_frames}; then
+    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file"
+    echo "$0: differ, partial diff is:"
+    partial_diff $tmpdir/utts{,.utt2num_frames}
+    exit 1
+  fi
+  awk <$data/utt2num_frames '{
+    if (NF != 2 || !($2 > 0) || $2 != int($2)) {
+      print "Bad line utt2num_frames:" NR ":" $0
+      exit 1 } }' || exit 1
+fi
+
+if [ -f $data/reco2dur ]; then
+  check_sorted_and_uniq $data/reco2dur
+  cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur
+  if [ -f $tmpdir/recordings ]; then
+    if ! cmp -s $tmpdir/recordings{,.reco2dur}; then
+      echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file"
+      echo "$0: differ, partial diff is:"
+      partial_diff $tmpdir/recordings{,.reco2dur}
+    exit 1;
+    fi
+  else
+    if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then
+      echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file"
+      echo "$0: differ, partial diff is:"
+      partial_diff $tmpdir/{utts,recordings.reco2dur}
+    exit 1;
+    fi
+  fi
+  cat $data/reco2dur | \
+    awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1
+fi
+
+
+echo "$0: Successfully validated data-directory $data"
--- a/examples/aishell/s0/tools/validate_dict_dir.pl
+++ b/examples/aishell/s0/tools/validate_dict_dir.pl
+#!/usr/bin/env perl
+
+# Apache 2.0.
+# Copyright  2012 Guoguo Chen
+#            2015 Daniel Povey
+#            2017 Johns Hopkins University (Jan "Yenda" Trmal <jtrmal@gmail.com>)
+#
+# Validation script for 'dict' directories (e.g. data/local/dict)
+
+# Reads the opened file (supplied as the first parameter) into an array of
+# lines and tests whether every line is valid UTF-8.
+# Returns (1, decoded lines) if all lines decode as UTF-8; otherwise returns
+# (0, raw lines), assuming the file uses some 1-byte encoding such as
+# ISO-8859-x or Windows CP-X.
+# We do not really care about the actual encoding; we just need the length of
+# the (decoded) strings to be correct so that output formatting looks right.
+sub get_utf8_or_bytestream {
+  use Encode qw(decode encode);
+  my $is_utf_compatible = 1;
+  my @unicode_lines;
+  my @raw_lines;
+  my $file = shift;
+
+  while (<$file>) {
+    my $raw_text = $_;
+    last unless $raw_text;
+    if ($is_utf_compatible) {
+      # LEAVE_SRC stops decode() from consuming $raw_text in place; without
+      # it every successfully decoded line would be stored below as "".
+      my $decoded_text = eval {
+        decode("UTF-8", $raw_text, Encode::FB_CROAK | Encode::LEAVE_SRC)
+      };
+      # After the first undecodable line no further decoding is attempted.
+      $is_utf_compatible = defined($decoded_text);
+      push @unicode_lines, $decoded_text if $is_utf_compatible;
+    }
+    push @raw_lines, $raw_text;
+  }
+
+  if (!$is_utf_compatible) {
+    return (0, @raw_lines);
+  } else {
+    return (1, @unicode_lines);
+  }
+}
+
+# Scans the decoded lines (passed as an array reference) for whitespace
+# problems.  Prints a message to STDERR and returns 1 at the first line that
+# does not end in LF, contains a CR, or contains any whitespace character
+# other than TAB, LF and SPACE; returns 0 when every line is clean.
+sub validate_utf8_whitespaces {
+  my $unicode_lines = shift;
+  use feature 'unicode_strings';
+  foreach my $line_no (0 .. $#{$unicode_lines}) {
+    my $text = $unicode_lines->[$line_no];
+    if (substr($text, -1) ne "\n") {
+      print STDERR "$0: The current line (nr. $line_no) has invalid newline\n";
+      return 1;
+    }
+    # The first whitespace-separated field identifies the line in messages.
+    my $utt_id = (split(" ", $text))[0];
+    if ($text =~ /\x{000d}/) {
+      print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n";
+      return 1;
+    }
+    # Mask the permitted whitespace (TAB, LF, SPACE); whatever \s still
+    # matches afterwards is a disallowed whitespace character.
+    (my $masked = $text) =~ s/[\x{0009}\x{000a}\x{0020}]/./g;
+    if ($masked =~ /\s/) {
+      print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n";
+      return 1;
+    }
+  }
+  return 0;
+}
+
+# Checks whether the text in the file handle (supplied as the argument) is
+# UTF-8 compatible and, if so, whether it contains only allowed whitespace.
+# Text that is not UTF-8 compatible is not checked.  The handle is moved back
+# to the position it had on entry after the text has been read.
+# Returns 0 if disallowed whitespace was found, 1 otherwise.
+sub check_allowed_whitespace {
+  # Import the constant explicitly instead of relying on the bareword
+  # SEEK_SET happening to evaluate to 0 in numeric context.
+  use Fcntl qw(SEEK_SET);
+  my $file = shift;
+  my $pos = tell($file);
+  (my $is_utf, my @lines) = get_utf8_or_bytestream($file);
+  seek($file, $pos, SEEK_SET);
+  if ($is_utf) {
+    my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines);
+    print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n";
+    if ($has_invalid_whitespaces) {
+      print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n";
+      return 0;
+    } else {
+      print "--> text contains only allowed whitespaces\n";
+    }
+  } else {
+    print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n";
+  }
+  return 1;
+}
+
+
+if(@ARGV != 1) {
+  die "Usage: validate_dict_dir.pl <dict-dir>\n" .
+      "e.g.: validate_dict_dir.pl data/local/dict\n";
+}
+
+$dict = shift @ARGV;
+$dict =~ s:/$::;
+
+$exit = 0;
+$success = 1;  # this is re-set each time we read a file.
+
+sub set_to_fail { $exit = 1; $success = 0; }
+
+# Checking silence_phones.txt -------------------------------
+print "Checking $dict/silence_phones.txt ...\n";
+if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;}
+if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;}
+$idx = 1;
+%silence = ();
+$crlf = 1;
+
+print "--> reading $dict/silence_phones.txt\n";
+check_allowed_whitespace(\*S) || set_to_fail();
+while(<S>) {
+  if (! s/\n$//) {
+    print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
+    set_to_fail();
+  }
+  if ($crlf == 1 && m/\r/) {
+    print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n";
+    set_to_fail();
+    $crlf = 0;
+  }
+  my @col = split(" ", $_);
+  if (@col == 0) {
+    set_to_fail();
+    print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
+  }
+  foreach(0 .. @col-1) {
+    my $p = $col[$_];
+    if($silence{$p}) {
+      set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n";
+    } else {
+      $silence{$p} = 1;
+    }
+    # phones that start with the pound sign/hash may be mistaken for
+    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
+    # problems with word-position-dependent systems, and <eps> is obviously
+    # confusable with epsilon.
+    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
+      set_to_fail();
+      print "--> ERROR: phone \"$p\" has disallowed written form\n";
+    }
+  }
+  $idx ++;
+}
+close(S);
+$success == 0 || print "--> $dict/silence_phones.txt is OK\n";
+print "\n";
+
+# Checking optional_silence.txt -------------------------------
+# The file is expected to hold exactly one phone, and that phone must already
+# be a key of %silence (filled while silence_phones.txt was read).
+print "Checking $dict/optional_silence.txt ...\n";
+if (-z "$dict/optional_silence.txt") {
+  print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n";
+  exit 1;
+}
+unless (open(OS, "<$dict/optional_silence.txt")) {
+  print "--> ERROR: fail to open $dict/optional_silence.txt\n";
+  exit 1;
+}
+# Per-file state: line counter, success flag, and "CR not yet reported" flag.
+($idx, $success, $crlf) = (1, 1, 1);
+print "--> reading $dict/optional_silence.txt\n";
+check_allowed_whitespace(\*OS) or exit 1;
+while (<OS>) {
+  chomp;
+  my @phones = split(" ", $_);
+  if ($idx > 1 or @phones > 1) {
+    # Either a second line or a second token on the first line.
+    set_to_fail();
+    print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
+  } elsif (!$silence{$phones[0]}) {
+    set_to_fail();
+    print "--> ERROR: phone $phones[0] not found in $dict/silence_phones.txt\n";
+  }
+  # chomp leaves "\r" in place, so CR can still be detected here; it is
+  # reported only once per file.
+  if ($crlf == 1 and /\r/) {
+    print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n";
+    set_to_fail();
+    $crlf = 0;
+  }
+  $idx++;
+}
+close(OS);
+print "--> $dict/optional_silence.txt is OK\n" unless $success == 0;
+print "\n";
+
+# Checking nonsilence_phones.txt -------------------------------
+# Every phone on every line is recorded in %nonsilence.  CR characters, a
+# missing final newline, empty lines, duplicated phones and disallowed
+# spellings are each reported as errors.
+print "Checking $dict/nonsilence_phones.txt ...\n";
+if (-z "$dict/nonsilence_phones.txt") {
+  print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n";
+  exit 1;
+}
+unless (open(NS, "<$dict/nonsilence_phones.txt")) {
+  print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n";
+  exit 1;
+}
+# Per-file state: line counter, success flag, and "CR not yet reported" flag.
+($idx, $success, $crlf) = (1, 1, 1);
+%nonsilence = ();
+print "--> reading $dict/nonsilence_phones.txt\n";
+set_to_fail() unless check_allowed_whitespace(\*NS);
+while (<NS>) {
+  if ($crlf == 1 and /\r/) {
+    print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n";
+    set_to_fail();
+    $crlf = 0;
+  }
+  # The substitution strips the newline and doubles as the "is it there" test.
+  unless (s/\n$//) {
+    print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
+    set_to_fail();
+  }
+  my @phones = split(" ", $_);
+  unless (@phones) {
+    set_to_fail();
+    print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
+  }
+  foreach my $p (@phones) {
+    if ($nonsilence{$p}) {
+      set_to_fail();
+      print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n";
+    } else {
+      $nonsilence{$p} = 1;
+    }
+    # phones that start with the pound sign/hash may be mistaken for
+    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
+    # problems with word-position-dependent systems, and <eps> is obviously
+    # confusable with epsilon.
+    if ($p eq "<eps>" || $p =~ m/^#/ || $p =~ m/_[BESI]$/) {
+      set_to_fail();
+      print "--> ERROR: phone \"$p\" has disallowed written form\n";
+    }
+  }
+  $idx++;
+}
+close(NS);
+print "--> $dict/nonsilence_phones.txt is OK\n" unless $success == 0;
+print "\n";
+
+# Checking disjoint -------------------------------
+# intersect(\%first, \%second)
+# Returns the list of keys that exist in both hashes.  Only key existence in
+# the second hash is tested; values are never looked at.  The result follows
+# Perl's (unspecified) iteration order over the first hash and is empty when
+# the two key sets are disjoint.
+sub intersect {
+  # Named $first/$second rather than $a/$b so that sort's special variables
+  # are not masked; nothing is written to package-level @itset/%itset.
+  my ($first, $second) = @_;
+  # Hash keys are already unique, so no extra de-duplication is needed.
+  return grep { exists $second->{$_} } keys %$first;
+}
+
+print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
+# A phone may be listed as silence or as non-silence, but not as both.
+@itset = intersect(\%silence, \%nonsilence);
+if (@itset) {
+  set_to_fail();
+  print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: ";
+  print "$_ " foreach @itset;
+  print "\n";
+} else {
+  print "--> disjoint property is OK.\n";
+}
+print "\n";
+
+
+# check_lexicon($lex, $num_prob_cols, $num_skipped_cols)
+# Validates one lexicon file whose lines have the layout
+#   <word> <prob> x $num_prob_cols <ignored> x $num_skipped_cols <phone> ...
+# Problems are printed and recorded through set_to_fail() (defined elsewhere
+# in this file; presumably it clears $success and records the failing exit
+# status -- confirm there).  Reported problems:
+#   * the file cannot be opened,
+#   * CR characters (reported once), repeated lines, missing final newline,
+#   * empty lines and the reserved words <s>, </s>, <eps>, #0,
+#   * pronunciation probabilities outside (0, 1],
+#   * empty pronunciations and phones that are in neither %silence nor
+#     %nonsilence.
+# Resets the file-level $idx, $success and $crlf before reading.  Returns
+# nothing useful.
+sub check_lexicon {
+  my ($lex, $num_prob_cols, $num_skipped_cols) = @_;
+  print "Checking $lex\n";
+  # Reset the per-file state *before* opening, so that an open failure is not
+  # wiped out again by "$success = 1".
+  $idx = 1; $success = 1; $crlf = 1;
+  if (!open(L, "<", $lex)) {
+    print "--> ERROR: fail to open $lex\n";
+    set_to_fail();
+    print "\n";
+    return;  # nothing to read; in particular do not report the file as OK
+  }
+  my %seen_line = ();
+  print "--> reading $lex\n";
+  check_allowed_whitespace(\*L) or set_to_fail();
+  while (<L>) {
+    if ($crlf == 1 && m/\r/) {
+      print "--> ERROR: $lex contains Carriage Return (^M) characters.\n";
+      set_to_fail();
+      $crlf = 0;
+    }
+    # Duplicates are detected on the raw line, newline included.
+    if (defined $seen_line{$_}) {
+      print "--> ERROR: line '$_' of $lex is repeated\n";
+      set_to_fail();
+    }
+    $seen_line{$_} = 1;
+    if (! s/\n$//) {
+      print "--> ERROR: last line '$_' of $lex does not end in newline.\n";
+      set_to_fail();
+    }
+    my @col = split(" ", $_);
+    my $word = shift @col;
+    if (!defined $word) {
+      print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail();
+      # There is no word to check any further; avoid follow-up errors that
+      # would only talk about an undefined word.
+      $idx ++;
+      next;
+    }
+    if ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>" || $word eq "#0") {
+      print "--> ERROR: lexicon.txt contains forbidden word $word\n";
+      set_to_fail();
+    }
+    for my $n (1 .. $num_prob_cols) {
+      my $prob = shift @col;
+      # A missing column is as bad as a value outside (0, 1].
+      if (!(defined $prob && $prob > 0.0 && $prob <= 1.0)) {
+        print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n";
+        set_to_fail();
+      }
+    }
+    shift @col for 1 .. $num_skipped_cols;
+    if (@col == 0) {
+      print "--> ERROR: lexicon.txt contains word $word with empty ";
+      print "pronunciation.\n";
+      set_to_fail();
+    }
+    foreach my $phone (@col) {
+      if (!$silence{$phone} and !$nonsilence{$phone}) {
+        print "--> ERROR: phone \"$phone\" is not in {, non}silence.txt ";
+        print "(line $idx)\n";
+        set_to_fail();
+      }
+    }
+    $idx ++;
+  }
+  close(L);
+  $success == 0 || print "--> $lex is OK\n";
+  print "\n";
+}
+
+# Validate whichever lexicon variants are present.  The numeric arguments to
+# check_lexicon() are the number of probability columns and the number of
+# further columns to skip between the word and its pronunciation.
+if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); }
+if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); }
+if (-f "$dict/lexiconp_silprob.txt") {
+  # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also
+  # exist.
+  check_lexicon("$dict/lexiconp_silprob.txt", 2, 2);
+  if (-f "$dict/silprob.txt") {
+    $crlf = 1;
+    if (!open(SP, "<", "$dict/silprob.txt")) {
+      print "--> ERROR: fail to open $dict/silprob.txt\n";
+      set_to_fail();
+    } else {
+      # Each line is "<key> <value>": "<s>" and "overall" carry a probability
+      # in (0, 1]; "</s>_s" and "</s>_n" carry a positive correction term.
+      while (<SP>) {
+        if ($crlf == 1 && m/\r/) {
+          print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n";
+          set_to_fail();
+          $crlf = 0;
+        }
+        chomp; my @col = split;
+        # A malformed line aborts the whole run immediately.
+        die "--> ERROR: bad line \"$_\"\n" if @col != 2;
+        if ($col[0] eq "<s>" || $col[0] eq "overall") {
+          if (!($col[1] > 0.0 && $col[1] <= 1.0)) {
+            set_to_fail();
+            print "--> ERROR: bad probability in $dict/silprob.txt \"$_\"\n";
+          }
+        } elsif ($col[0] eq "</s>_s" || $col[0] eq "</s>_n") {
+          if ($col[1] <= 0.0) {
+            set_to_fail();
+            print "--> ERROR: bad correction term in $dict/silprob.txt \"$_\"\n";
+          }
+        } else {
+          print "--> ERROR: unexpected line in $dict/silprob.txt \"$_\"\n";
+          set_to_fail();
+        }
+      }
+      close(SP);
+    }
+  } else {
+    set_to_fail();
+    print "--> ERROR: expecting $dict/silprob.txt to exist\n";
+  }
+}
+
+# At least one of the two plain lexicon variants is mandatory.
+if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
+  print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dict\n";
+  set_to_fail();
+}
+
+# check_lexicon_pair($lex1, $num_prob_cols1, $num_skipped_cols1,
+#                    $lex2, $num_prob_cols2, $num_skipped_cols2)
+# Verifies that two lexicon files describe the same entries in the same
+# order: line by line, the word and the pronunciation (everything after the
+# word once the probability and skipped columns of each file are dropped)
+# must be identical, and both files must have the same number of lines.
+# The first discrepancy is printed and recorded with set_to_fail(); checking
+# then stops.  Returns nothing useful.
+sub check_lexicon_pair {
+  my ($lex1, $num_prob_cols1, $num_skipped_cols1,
+      $lex2, $num_prob_cols2, $num_skipped_cols2) = @_;
+  # We have checked individual lexicons already, but the files may still have
+  # become unreadable, so the opens are verified anyway.
+  if (!open(L1, "<", $lex1)) {
+    print "--> ERROR: fail to open $lex1\n";
+    set_to_fail();
+    return;
+  }
+  if (!open(L2, "<", $lex2)) {
+    print "--> ERROR: fail to open $lex2\n";
+    set_to_fail();
+    close(L1);
+    return;
+  }
+  print "Checking lexicon pair $lex1 and $lex2\n";
+  # Outcome of *this* comparison only; errors found by earlier checks must
+  # neither hide a length mismatch nor suppress the final "match" message.
+  my $pair_ok = 1;
+  my $line_num = 0;
+  while (my $line_A = <L1>) {
+    $line_num++;
+    my @A = split(" ", $line_A);
+    my $line_B = <L2>;
+    if (!defined $line_B) {
+      print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
+      set_to_fail(); $pair_ok = 0; last;
+    }
+    my @B = split(" ", $line_B);
+    # Check if the word matches.
+    if ($A[0] ne $B[0]) {
+      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
+      set_to_fail(); $pair_ok = 0; last;
+    }
+    shift @A; shift @B;
+    # Drop the probability and skipped columns so only the phones remain.
+    splice(@A, 0, $num_prob_cols1 + $num_skipped_cols1);
+    splice(@B, 0, $num_prob_cols2 + $num_skipped_cols2);
+    # Check if the pronunciation matches
+    if (join(" ", @A) ne join(" ", @B)) {
+      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
+      set_to_fail(); $pair_ok = 0; last;
+    }
+  }
+  # Anything left in the second file means it is longer than the first one.
+  if ($pair_ok && defined(my $extra_B = <L2>)) {
+    print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
+    set_to_fail(); $pair_ok = 0;
+  }
+  close(L1);
+  close(L2);
+  $pair_ok && print "--> lexicon pair $lex1 and $lex2 match\n\n";
+}
+
+# When several lexicon variants coexist they have to agree with each other:
+# the user may have overwritten just one of them, and we cannot tell which
+# file is the stale one that would need regenerating.  Both comparisons
+# involve lexiconp.txt, so nothing is compared when it is absent.
+if (-f "$dict/lexiconp.txt") {
+  if (-f "$dict/lexicon.txt") {
+    check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0);
+  }
+  if (-f "$dict/lexiconp_silprob.txt") {
+    check_lexicon_pair("$dict/lexiconp.txt", 1, 0,
+                       "$dict/lexiconp_silprob.txt", 2, 2);
+  }
+}
+
+# Checking extra_questions.txt -------------------------------
+%distinguished = (); # Keep track of all phone-pairs including nonsilence that
+                     # are distinguished (split apart) by extra_questions.txt,
+                     # as $distinguished{$p1,$p2} = 1.  This will be used to
+                     # make sure that we don't have pairs of phones on the same
+                     # line in nonsilence_phones.txt that can never be
+                     # distinguished from each other by questions.  (If any two
+                     # phones appear on the same line in nonsilence_phones.txt,
+                     # they share a tree root, and since the automatic
+                     # question-building treats all phones that appear on the
+                     # same line of nonsilence_phones.txt as being in the same
+                     # group, we can never distinguish them without resorting to
+                     # questions in extra_questions.txt.)
+print "Checking $dict/extra_questions.txt ...\n";
+if (-s "$dict/extra_questions.txt") {
+  # Reset the per-file state before opening so that an open failure is not
+  # overwritten by "$success = 1" and then reported as OK.
+  $idx = 1;
+  $success = 1;
+  $crlf = 1;
+  if (!open(EX, "<", "$dict/extra_questions.txt")) {
+    set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n";
+  } else {
+    print "--> reading $dict/extra_questions.txt\n";
+    check_allowed_whitespace(\*EX) or set_to_fail();
+    while(<EX>) {
+      if ($crlf == 1 && m/\r/) {
+        print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n";
+        set_to_fail();
+        $crlf = 0;
+      }
+      if (! s/\n$//) {
+        print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
+        set_to_fail();
+      }
+      # Each line is one question: a whitespace-separated set of phones.
+      my @col = split(" ", $_);
+      if (@col == 0) {
+        set_to_fail();  print "--> ERROR: empty line in $dict/extra_questions.txt\n";
+      }
+      foreach my $block (0 .. $#col) {
+        my $phone = $col[$block];
+        if(!$silence{$phone} and !$nonsilence{$phone}) {
+          set_to_fail();  print "--> ERROR: phone \"$phone\" is not in {, non}silence_phones.txt (line $idx, block ", $block+1, ")\n";
+        }
+      }
+      # $idx counts lines, so it advances once per line, not once per phone.
+      $idx ++;
+      my %col_hash = map { $_ => 1 } @col;
+      # Nonsilence phones that are NOT part of this question; computed once
+      # per line instead of once per phone of the question.
+      my @outside = grep { !defined $col_hash{$_} } keys %nonsilence;
+      foreach my $p1 (@col) {
+        # Update %distinguished hash: for each p1 in this question and p2 not
+        # in this question (and in nonsilence phones)... mark p1,p2 as being
+        # split apart.
+        foreach my $p2 (@outside) {
+          $distinguished{$p1,$p2} = 1;
+          $distinguished{$p2,$p1} = 1;
+        }
+      }
+    }
+    close(EX);
+  }
+  $success == 0 || print "--> $dict/extra_questions.txt is OK\n";
+} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}
+
+# Checking nonterminals.txt (only if present) -------------------------------
+# Every line must consist of exactly one token, start with "#nonterm:", and
+# must not repeat an earlier line.  The first violation ends the run with
+# exit status 1.
+if (-f "$dict/nonterminals.txt") {
+  open(NT, "<", "$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt";
+  my %nonterminals = ();
+  my $line_number = 1;
+  while (<NT>) {
+    # chomp rather than chop: if the last line has no trailing newline, chop
+    # would silently remove the final character of the symbol itself.
+    chomp;
+    my @line = split(" ", $_);
+    if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) {
+      print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1;
+    }
+    $nonterminals{$line[0]} = 1;
+    $line_number++;
+  }
+  close(NT);
+  print "--> $dict/nonterminals.txt is OK\n";
+}
+
+
+# Second pass over nonsilence_phones.txt, looking for phone-pairs that are
+# never distinguishable: two different phones on the same line must appear in
+# %distinguished, which was filled from extra_questions.txt.  (note: this
+# situation is normal and expected for silence phones, so we don't check it.)
+unless (open(NS, "<$dict/nonsilence_phones.txt")) {
+  print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n";
+  exit 1;
+}
+
+# Number of offending pairs seen so far, and the count at which the
+# per-pair messages stop.
+$num_warn_nosplit = 0;
+$num_warn_nosplit_limit = 10;
+while (<NS>) {
+  my @phones = split(" ", $_);
+  foreach $p1 (@phones) {
+    foreach $p2 (@phones) {
+      next if $p1 eq $p2 || $distinguished{$p1,$p2};
+      # Every offending pair fails validation, even when it is no longer
+      # printed.
+      set_to_fail();
+      print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"
+        if $num_warn_nosplit <= $num_warn_nosplit_limit;
+      print "... Not warning any more times about this issue.\n"
+        if $num_warn_nosplit == $num_warn_nosplit_limit;
+      if ($num_warn_nosplit == 0) {
+        # Extra explanation, shown once after the very first message.
+        print "    (note: we started checking for this only recently.  You can still build a system but\n";
+        print "     phones $p1 and $p2 will be acoustically indistinguishable).\n";
+      }
+      $num_warn_nosplit++;
+    }
+  }
+}
+
+
+# Final verdict.  $exit is tested against 1 here; it is presumably raised by
+# set_to_fail() whenever one of the checks above fails (confirm at its
+# definition).
+unless ($exit == 1) {
+  print "--> SUCCESS [validating dictionary directory $dict]\n\n";
+  exit 0;
+}
+
+print "--> ERROR validating dictionary directory $dict (see detailed error messages above)\n\n";
+exit 1;
--- a/examples/aishell/s0/tools/validate_text.pl
+++ b/examples/aishell/s0/tools/validate_text.pl
+#!/usr/bin/env perl
+#
+#===============================================================================
+# Copyright 2017  Johns Hopkins University (author: Yenda Trmal <jtrmal@gmail.com>)
+#                 Johns Hopkins University (author: Daniel Povey)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+# validation script for data/<dataset>/text
+# to be called (preferably) from utils/validate_data_dir.sh
+use strict;
+use warnings;
+use utf8;
+use Fcntl qw< SEEK_SET >;
+
+# get_utf8_or_bytestream($fh)
+# Reads all remaining lines from the already opened filehandle $fh and
+# returns the list ($is_utf8, @lines):
+#   * (1, @decoded_lines) when every line is valid UTF-8; the lines are the
+#     decoded character strings;
+#   * (0, @raw_lines) when at least one line is not valid UTF-8; the lines
+#     are then the unmodified bytes as read, on the assumption that the file
+#     uses some 1-byte encoding such as ISO-8859-x or Windows CP-X.
+# We do not really care about the actual encoding; we only need the length
+# of the (decoded) strings to be right so that output formatting looks right.
+# The caller's $_ is left untouched.
+sub get_utf8_or_bytestream {
+  use Encode qw(decode encode);
+  my $file = shift;
+  my $is_utf_compatible = 1;
+  my @unicode_lines;
+  my @raw_lines;
+
+  # A lexical loop variable (with the implicit defined() test of while) keeps
+  # a final line consisting of just "0" and does not clobber the global $_.
+  while (my $raw_text = <$file>) {
+    if ($is_utf_compatible) {
+      # LEAVE_SRC is essential: with a plain FB_CROAK check, decode() consumes
+      # its input in place, so each successfully decoded line would end up as
+      # an empty string in @raw_lines.
+      my $decoded_text = eval {
+        decode("UTF-8", $raw_text, Encode::FB_CROAK() | Encode::LEAVE_SRC())
+      };
+      if (defined $decoded_text) {
+        push @unicode_lines, $decoded_text;
+      } else {
+        # First undecodable line: stop decoding, keep collecting raw bytes.
+        $is_utf_compatible = 0;
+      }
+    }
+    push @raw_lines, $raw_text;
+  }
+
+  return $is_utf_compatible ? (1, @unicode_lines) : (0, @raw_lines);
+}
+
+# validate_utf8_whitespaces(\@unicode_lines)
+# Scans the given (already decoded) lines and returns 1, after writing a
+# diagnostic to STDERR, at the first line that
+#   * does not end in LF,
+#   * contains a CR (U+000D), or
+#   * contains any whitespace character other than TAB, LF and SPACE.
+# Returns 0 when no line is objectionable.  The input lines are not modified.
+sub validate_utf8_whitespaces {
+  my $unicode_lines = shift;
+  use feature 'unicode_strings';
+  for my $i (0 .. $#{$unicode_lines}) {
+    my $text = $unicode_lines->[$i];
+    unless (substr($text, -1) eq "\n") {
+      print STDERR "$0: The current line (nr. $i) has invalid newline\n";
+      return 1;
+    }
+    # The first whitespace-separated token is the utterance id; it is only
+    # used to make the diagnostics below more helpful.
+    my ($utt_id) = split(" ", $text);
+    if (index($text, "\r") >= 0) {
+      print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n";
+      return 1;
+    }
+    # Mask the three permitted whitespace characters on a copy; whatever
+    # still matches \s afterwards is a disallowed one.
+    (my $masked = $text) =~ tr/\t\n /./;
+    if ($masked =~ /\s/) {
+      print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n";
+      return 1;
+    }
+  }
+  return 0;
+}
+
+# check_allowed_whitespace($fh, $filename)
+# Reads the rest of the file behind $fh and, if its content is UTF-8
+# compatible, verifies that it contains only the allowed whitespace
+# characters; content that is not UTF-8 compatible is accepted unchecked.
+# The read position of $fh is restored before returning.  Returns 1 when the
+# file is acceptable and 0 (after an error message on STDERR that names
+# $filename) otherwise.
+sub check_allowed_whitespace {
+  my ($file, $filename) = @_;
+  my $start_offset = tell($file);
+  my ($is_utf, @lines) = get_utf8_or_bytestream($file);
+  seek($file, $start_offset, SEEK_SET);
+
+  return 1 unless $is_utf;
+  return 1 unless validate_utf8_whitespaces(\@lines);
+
+  print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n";
+  return 0;
+}
+
+# Entry point: validate the single text file named on the command line and
+# exit with status 1 if it is missing, empty, unreadable or contains
+# disallowed whitespace.
+if (@ARGV != 1) {
+  die "Usage: validate_text.pl <text-file>\n" .
+      "e.g.: validate_text.pl data/train/text\n";
+}
+
+my $text = shift @ARGV;
+
+# -s is false both for a missing file and for a zero-length one, which is
+# exactly what the message says (-z alone is false for a missing file).
+unless (-s $text) {
+  print STDERR "$0: ERROR: file '$text' is empty or does not exist\n";
+  exit 1;
+}
+
+# Three-argument open with a lexical handle: the file name comes from the
+# command line and must never be interpreted as an open mode or a pipe.
+my $text_fh;
+unless (open($text_fh, "<", $text)) {
+  print STDERR "$0: ERROR: failed to open $text: $!\n";
+  exit 1;
+}
+
+check_allowed_whitespace($text_fh, $text) or exit 1;
+close($text_fh);
--- a/examples/aishell/s0/tools/wav2dur.py
+++ b/examples/aishell/s0/tools/wav2dur.py
+#!/usr/bin/env python3
+# encoding: utf-8
+
+import sys
+
+import torchaudio
+torchaudio.set_audio_backend("sox_io")
+
+scp = sys.argv[1]
+dur_scp = sys.argv[2]
+
+with open(scp, 'r') as f, open(dur_scp, 'w') as fout:
+    cnt = 0
+    total_duration = 0
+    for l in f:
+        items = l.strip().split()
+        wav_id = items[0]
+        fname = items[1]
+        cnt += 1
+        waveform, rate = torchaudio.load(fname)
+        frames = len(waveform[0])
+        duration = frames / float(rate)
+        total_duration += duration
+        fout.write('{} {}\n'.format(wav_id, duration))
+    print('process {} utts'.format(cnt))
+    print('total {} s'.format(total_duration))
--- a/examples/aishell/s0/tools/wav_to_duration.sh
+++ b/examples/aishell/s0/tools/wav_to_duration.sh
+#!/bin/bash
+# split the wav scp, calculate duration and merge
+# Number of slices the input scp is split into; one background job per slice.
+nj=4
+. tools/parse_options.sh || exit 1;
+
+if [ $# -lt 2 ]; then
+  echo "Usage: $0 <in-scp> <out-scp> [<logdir>]" >&2
+  exit 1
+fi
+
+inscp=$1
+outscp=$2
+data=$(dirname "${inscp}")
+# The log directory defaults to "log" next to the input scp.
+if [ $# -eq 3 ]; then
+  logdir=$3
+else
+  logdir=${data}/log
+fi
+mkdir -p "${logdir}"
+
+# Remove the leftovers of a previous run so that stale pieces are not merged.
+rm -f "${logdir}"/wav_*.slice
+rm -f "${logdir}"/wav_*.shape
+split --additional-suffix .slice -d -n "l/${nj}" "${inscp}" "${logdir}/wav_"
+
+pids=()
+# Iterate over the glob itself rather than over the output of ls, so that
+# paths containing whitespace are handled correctly.
+for slice in "${logdir}"/wav_*.slice; do
+  [ -e "${slice}" ] || continue
+{
+    name=$(basename -s .slice "${slice}")
+    tools/wav2dur.py "${slice}" "${logdir}/${name}.shape" 1>"${logdir}/${name}.log"
+} &
+  pids+=($!)
+done
+
+# Wait for every job individually: a bare "wait" would hide their failures
+# and a truncated output file would be produced without any error.
+status=0
+for pid in "${pids[@]}"; do
+  wait "${pid}" || status=1
+done
+if [ "${status}" -ne 0 ]; then
+  echo "$0: at least one wav2dur.py job failed, see ${logdir}/wav_*.log" >&2
+  exit 1
+fi
+cat "${logdir}"/wav_*.shape > "${outscp}"