Commit a7785cc6 authored by Sugon_ldc

delete soft link

parent 9a2a05ca
../../../tools/
\ No newline at end of file
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
stage=0 # start from 0 if you need to start from data preparation
stop_stage=0
nj=16
feat_dir=raw_wav
dict=data/dict/lang_char.txt
dir=exp/
config=$dir/train.yaml
checkpoint=
checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt
config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml
set=
ali_format=$feat_dir/$set/format.data
ali_format=format.data
ali_result=$dir/ali
. tools/parse_options.sh || exit 1;
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
nj=32
# Prepare required data for ctc alignment
echo "Prepare data, prepare required format"
for x in $set; do
tools/format_data.sh --nj ${nj} \
--feat-type wav --feat $feat_dir/$x/wav.scp \
$feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp
done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Run CTC alignment; specify the model to use via --checkpoint
python wenet/bin/alignment_deprecated.py --gpu -1 \
--config $config \
--input_file $ali_format \
--checkpoint $checkpoint \
--batch_size 1 \
--dict $dict \
--result_file $ali_result
fi
#!/usr/bin/env python3
# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Analyze Dataset, Duration/TextLength/Speed etc.
Usage:
. ./path.sh && python3 tools/analyze_dataset.py \
--data_type "shard" \
--data_list data/test/data.list \
--output_dir exp/analyze_test \
--num_thread 32
"""
import os
import json
import math
import time
import numpy
import logging
import librosa
import tarfile
import argparse
import torchaudio
import multiprocessing
from wenet.utils.file_utils import read_lists
from wenet.dataset.processor import AUDIO_FORMAT_SETS
def get_args():
parser = argparse.ArgumentParser(description='Analyze dataset')
parser.add_argument('--data_type',
default='wav_scp',
choices=['wav_scp', 'raw', 'shard'],
help='dataset type')
parser.add_argument('--output_dir', type=str,
default="exp", help='write info to output dir')
parser.add_argument('--data_list', default=None,
help='used in raw/shard mode')
parser.add_argument('--wav_scp', default=None,
help='used in wav_scp mode')
parser.add_argument('--text', default=None,
help='used in wav_scp mode')
parser.add_argument('--num_thread', type=int,
default=4, help='number of threads')
args = parser.parse_args()
print(args)
return args
def analyze(datas, output_file, thread_id):
with open(output_file, "w", encoding='utf8') as f:
for i, data in enumerate(datas):
if type(data['wav']) is numpy.ndarray:
y, sample_rate = data['wav'], data['sample_rate']
data['wav'] = "None" # NOTE(xcsong): Do not save wav.
elif type(data['wav']) is str:
y, sample_rate = librosa.load(data['wav'], sr=16000)
data['dur'] = len(y) / sample_rate
data['txt_length'] = len(data['txt'])
data['speed'] = data['txt_length'] / data['dur']
# Trim the beginning and ending silence
_, index = librosa.effects.trim(y, top_db=30)
data['leading_sil'] = librosa.get_duration(
y=y[:index[0]], sr=sample_rate) * 1000 if index[0] > 0 else 0
data['trailing_sil'] = librosa.get_duration(
y=y[index[1]:], sr=sample_rate) * 1000 if index[1] < len(y) else 0
data_str = json.dumps(data, ensure_ascii=False)
f.write("{}\n".format(data_str))
if thread_id == 0 and i % 100 == 0:
logging.info("\tThread-{}: processed {}/{}".format(
thread_id, i, len(datas)))
def read_tar(file):
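# A shard tar is assumed to group one utterance per key: entries named
# "<key>.txt", "<key>.wav"/"<key>.flac"/... are stored consecutively, so a
# change of prefix marks the end of one utterance and the start of the next.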
try:
with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream:
prev_prefix = None
data = {}
valid = True
for tarinfo in stream:
name = tarinfo.name
pos = name.rfind('.')
assert pos > 0
prefix, postfix = name[:pos], name[pos + 1:]
if prev_prefix is not None and prefix != prev_prefix:
data['key'] = prev_prefix
if valid:
yield data
data = {}
valid = True
with stream.extractfile(tarinfo) as file_obj:
try:
if postfix == 'txt':
data['txt'] = file_obj.read().decode(
'utf8').strip()
elif postfix in AUDIO_FORMAT_SETS:
waveform, sample_rate = torchaudio.load(
file_obj)
# single channel
data['wav'] = waveform.numpy()[0, :]
data['sample_rate'] = sample_rate
else:
data[postfix] = file_obj.read()
except Exception as ex:
valid = False
logging.warning(
'error: {} when parse {}'.format(ex, name))
prev_prefix = prefix
# The last data in tar
if prev_prefix is not None:
data['key'] = prev_prefix
yield data
except Exception as ex:
logging.warning(
'tar_file error: {} when processing {}'.format(ex, file))
def main():
start_time = time.time()
args = get_args()
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
os.makedirs(args.output_dir, exist_ok=True)
os.makedirs(args.output_dir + "/partition", exist_ok=True)
datas = [[] for i in range(args.num_thread)]
logging.info("Stage-1: Loading data.list OR wav.scp...")
if args.data_type == "shard":
assert args.data_list is not None
lists = read_lists(args.data_list)
# partition
total = 0
for line in lists:
for data in read_tar(line):
datas[total % args.num_thread].append(data)
total = total + 1
elif args.data_type == "raw":
assert args.data_list is not None
lists = read_lists(args.data_list)
# partition
for i, line in enumerate(lists):
data = json.loads(line)
datas[i % args.num_thread].append(data)
elif args.data_type == "wav_scp":
assert args.wav_scp is not None
assert args.text is not None
wavs, texts = {}, {}
# wavs
for line in read_lists(args.wav_scp):
line = line.strip().split()
wavs[line[0]] = line[1]
# texts
for line in read_lists(args.text):
line = line.strip().split(maxsplit=1)
texts[line[0]] = line[1]
wavs = dict(sorted(wavs.items()))
texts = dict(sorted(texts.items()))
# partition
for i, (key1, key2) in enumerate(zip(wavs, texts)):
assert key1 == key2
datas[i % args.num_thread].append(
{'key': key1, "wav": wavs[key1], "txt": texts[key1]}
)
logging.info("Stage-2: Start Analyze")
# threads
pool = multiprocessing.Pool(processes=args.num_thread)
for i in range(args.num_thread):
output_file = os.path.join(
args.output_dir, "partition", "part-{}".format(i))
pool.apply_async(analyze, (datas[i], output_file, i))
pool.close()
pool.join()
logging.info("Stage-3: Sort and Write Result")
datas = []
for i in range(args.num_thread):
output_file = os.path.join(
args.output_dir, "partition", "part-{}".format(i))
with open(output_file, "r", encoding='utf8') as f:
for line in f.readlines():
data = json.loads(line)
datas.append(data)
total_dur = sum([x['dur'] for x in datas])
total_len = sum([x['txt_length'] for x in datas])
total_leading_sil = sum([x['leading_sil'] for x in datas])
total_trailing_sil = sum([x['trailing_sil'] for x in datas])
num_datas = len(datas)
names = ['key', 'dur', 'txt_length', 'speed',
'leading_sil', 'trailing_sil']
units = ['', 's', '', 'char/s', 'ms', 'ms']
avgs = [0, total_dur / num_datas, total_len / num_datas,
total_len / total_dur, total_leading_sil / num_datas,
total_trailing_sil / num_datas]
stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]),
sum([(x['txt_length'] - avgs[2])**2 for x in datas]),
sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]),
sum([(x['leading_sil'] - avgs[4])**2 for x in datas]),
sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])]
stds = [math.sqrt(x / num_datas) for x in stds]
parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min']
index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75),
int(num_datas * 0.50), int(num_datas * 0.25), 0]
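# NOTE: `index` holds the positions of max/P99/P75/P50/P25/min in the list
# after it is re-sorted by each metric below, so percentiles are read off
# by direct indexing.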
with open(args.output_dir + "/analyze_result_brief",
"w", encoding='utf8') as f:
for i, (name, unit, avg, std) in enumerate(
zip(names, units, avgs, stds)):
if name == 'key':
continue
f.write("==================\n")
datas.sort(key=lambda x: x[name])
for p, j in zip(parts, index):
f.write("{} {}: {:.3f} {} (wav_id: {})\n".format(
p, name, datas[j][name], unit, datas[j]['key']))
f.write("avg {}: {:.3f} {}\n".format(
name, avg, unit))
f.write("std {}: {:.3f}\n".format(
name, std))
os.system("cat {}".format(args.output_dir + "/analyze_result_brief"))
datas.sort(key=lambda x: x['dur'])
with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f:
for data in datas:
f.write("{}\n".format(json.dumps(data, ensure_ascii=False)))
end_time = time.time()
logging.info("Time Cost: {:.3f}s".format(end_time - start_time))
if __name__ == '__main__':
main()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import logging
import sys
import json
def kaldi2json(kaldi_cmvn_file):
means = []
variance = []
with open(kaldi_cmvn_file, 'r') as fid:
# kaldi binary file start with '\0B'
if fid.read(2) == '\0B':
logging.error('kaldi cmvn binary file is not supported, please '
'recompute it by: compute-cmvn-stats --binary=false '
' scp:feats.scp global_cmvn')
sys.exit(1)
fid.seek(0)
arr = fid.read().split()
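# Assumed Kaldi text-format CMVN stats: a 2 x (feat_dim + 1) matrix written as
#   [ sum_1 ... sum_D frame_count
#     sumsq_1 ... sumsq_D 0 ]
# so the flattened token list is '[', D sums, count, D squared sums, '0', ']'.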
assert (arr[0] == '[')
assert (arr[-2] == '0')
assert (arr[-1] == ']')
feat_dim = int((len(arr) - 2 - 2) / 2)
for i in range(1, feat_dim + 1):
means.append(float(arr[i]))
count = float(arr[feat_dim + 1])
for i in range(feat_dim + 2, 2 * feat_dim + 2):
variance.append(float(arr[i]))
cmvn_info = {'mean_stat' : means,
'var_stat' : variance,
'frame_num' : count}
return cmvn_info
if __name__ == '__main__':
with open(sys.argv[2], 'w') as fout:
cmvn = kaldi2json(sys.argv[1])
fout.write(json.dumps(cmvn))
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# 2014 David Snyder
# This script combines the data from multiple source directories into
# a single destination directory.
# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information
# about what these directories contain.
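# Example invocation (hypothetical directory names):
#   tools/combine_data.sh data/train_combined data/train_set1 data/train_set2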
# Begin configuration section.
extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..."
skip_fix=false # skip the fix_data_dir.sh in the end
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi
if [ $# -lt 2 ]; then
echo "Usage: combine_data.sh [--extra-files 'file1 file2'] <dest-data-dir> <src-data-dir1> <src-data-dir2> ..."
echo "Note, files that don't appear in all source dirs will not be combined,"
echo "with the exception of utt2uniq and segments, which are created where necessary."
exit 1
fi
dest=$1;
shift;
first_src=$1;
rm -r $dest 2>/dev/null
mkdir -p $dest;
export LC_ALL=C
for dir in $*; do
if [ ! -f $dir/utt2spk ]; then
echo "$0: no such file $dir/utt2spk"
exit 1;
fi
done
# Check that frame_shift are compatible, where present together with features.
dir_with_frame_shift=
for dir in $*; do
if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then
if [[ $dir_with_frame_shift ]] &&
! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then
echo "$0: error: different frame_shift in directories $dir and " \
"$dir_with_frame_shift. Cannot combine features."
exit 1;
fi
dir_with_frame_shift=$dir
fi
done
# W.r.t. the utt2uniq file the script behaves differently from other files:
# it is not compulsory for it to exist in the src directories, but if it exists
# in even one it should exist in all. We will create the files where necessary.
has_utt2uniq=false
for in_dir in $*; do
if [ -f $in_dir/utt2uniq ]; then
has_utt2uniq=true
break
fi
done
if $has_utt2uniq; then
# we are going to create an utt2uniq file in the destdir
for in_dir in $*; do
if [ ! -f $in_dir/utt2uniq ]; then
# we assume that utt2uniq is a one to one mapping
cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}'
else
cat $in_dir/utt2uniq
fi
done | sort -k1 > $dest/utt2uniq
echo "$0: combined utt2uniq"
else
echo "$0 [info]: not combining utt2uniq as it does not exist"
fi
# some of the old scripts might provide utt2uniq as an extra file, so just remove it
extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g")
# segments are treated similarly to utt2uniq. If it exists in some, but not all
# src directories, then we generate segments where necessary.
has_segments=false
for in_dir in $*; do
if [ -f $in_dir/segments ]; then
has_segments=true
break
fi
done
if $has_segments; then
for in_dir in $*; do
if [ ! -f $in_dir/segments ]; then
echo "$0 [info]: will generate missing segments for $in_dir" 1>&2
utils/data/get_segments_for_data.sh $in_dir
else
cat $in_dir/segments
fi
done | sort -k1 > $dest/segments
echo "$0: combined segments"
else
echo "$0 [info]: not combining segments as it does not exist"
fi
for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
exists_somewhere=false
absent_somewhere=false
for d in $*; do
if [ -f $d/$file ]; then
exists_somewhere=true
else
absent_somewhere=true
fi
done
if ! $absent_somewhere; then
set -o pipefail
( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1;
set +o pipefail
echo "$0: combined $file"
else
if ! $exists_somewhere; then
echo "$0 [info]: not combining $file as it does not exist"
else
echo "$0 [info]: **not combining $file as it does not exist everywhere**"
fi
fi
done
tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt
if [[ $dir_with_frame_shift ]]; then
cp $dir_with_frame_shift/frame_shift $dest
fi
if ! $skip_fix ; then
tools/fix_data_dir.sh $dest || exit 1;
fi
exit 0
This diff is collapsed.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re, sys, unicodedata
import codecs
remove_tag = True
spacelist= [' ', '\t', '\r', '\n']
puncts = ['!', ',', '?',
'、', '。', '!', ',', ';', '?',
':', '「', '」', '︰', '『', '』', '《', '》']
def characterize(string) :
res = []
i = 0
while i < len(string):
char = string[i]
if char in puncts:
i += 1
continue
cat1 = unicodedata.category(char)
#https://unicodebook.readthedocs.io/unicode.html#unicode-categories
if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
i += 1
continue
if cat1 == 'Lo': # letter-other
res.append(char)
i += 1
else:
# some input looks like: <unk><noise>, we want to separate it into two words.
sep = ' '
if char == '<': sep = '>'
j = i+1
while j < len(string):
c = string[j]
if ord(c) >= 128 or (c in spacelist) or (c==sep):
break
j += 1
if j < len(string) and string[j] == '>':
j += 1
res.append(string[i:j])
i = j
return res
def stripoff_tags(x):
if not x: return ''
chars = []
i = 0; T=len(x)
while i < T:
if x[i] == '<':
while i < T and x[i] != '>':
i += 1
i += 1
else:
chars.append(x[i])
i += 1
return ''.join(chars)
def normalize(sentence, ignore_words, cs, split=None):
""" sentence, ignore_words are both in unicode
"""
new_sentence = []
for token in sentence:
x = token
if not cs:
x = x.upper()
if x in ignore_words:
continue
if remove_tag:
x = stripoff_tags(x)
if not x:
continue
if split and x in split:
new_sentence += split[x]
else:
new_sentence.append(x)
return new_sentence
class Calculator :
def __init__(self) :
self.data = {}
self.space = []
self.cost = {}
self.cost['cor'] = 0
self.cost['sub'] = 1
self.cost['del'] = 1
self.cost['ins'] = 1
def calculate(self, lab, rec) :
# Initialization
lab.insert(0, '')
rec.insert(0, '')
while len(self.space) < len(lab) :
self.space.append([])
for row in self.space :
for element in row :
element['dist'] = 0
element['error'] = 'non'
while len(row) < len(rec) :
row.append({'dist' : 0, 'error' : 'non'})
for i in range(len(lab)) :
self.space[i][0]['dist'] = i
self.space[i][0]['error'] = 'del'
for j in range(len(rec)) :
self.space[0][j]['dist'] = j
self.space[0][j]['error'] = 'ins'
self.space[0][0]['error'] = 'non'
for token in lab :
if token not in self.data and len(token) > 0 :
self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0}
for token in rec :
if token not in self.data and len(token) > 0 :
self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0}
# Computing edit distance
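# DP recurrence: dist[i][j] = min(dist[i-1][j] + del_cost,
#                                 dist[i][j-1] + ins_cost,
#                                 dist[i-1][j-1] + (cor_cost if lab[i] == rec[j] else sub_cost))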
for i, lab_token in enumerate(lab) :
for j, rec_token in enumerate(rec) :
if i == 0 or j == 0 :
continue
min_dist = sys.maxsize
min_error = 'none'
dist = self.space[i-1][j]['dist'] + self.cost['del']
error = 'del'
if dist < min_dist :
min_dist = dist
min_error = error
dist = self.space[i][j-1]['dist'] + self.cost['ins']
error = 'ins'
if dist < min_dist :
min_dist = dist
min_error = error
if lab_token == rec_token :
dist = self.space[i-1][j-1]['dist'] + self.cost['cor']
error = 'cor'
else :
dist = self.space[i-1][j-1]['dist'] + self.cost['sub']
error = 'sub'
if dist < min_dist :
min_dist = dist
min_error = error
self.space[i][j]['dist'] = min_dist
self.space[i][j]['error'] = min_error
# Tracing back
result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0}
i = len(lab) - 1
j = len(rec) - 1
while True :
if self.space[i][j]['error'] == 'cor' : # correct
if len(lab[i]) > 0 :
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
result['all'] = result['all'] + 1
result['cor'] = result['cor'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, rec[j])
i = i - 1
j = j - 1
elif self.space[i][j]['error'] == 'sub' : # substitution
if len(lab[i]) > 0 :
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
result['all'] = result['all'] + 1
result['sub'] = result['sub'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, rec[j])
i = i - 1
j = j - 1
elif self.space[i][j]['error'] == 'del' : # deletion
if len(lab[i]) > 0 :
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
result['all'] = result['all'] + 1
result['del'] = result['del'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, "")
i = i - 1
elif self.space[i][j]['error'] == 'ins' : # insertion
if len(rec[j]) > 0 :
self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
result['ins'] = result['ins'] + 1
result['lab'].insert(0, "")
result['rec'].insert(0, rec[j])
j = j - 1
elif self.space[i][j]['error'] == 'non' : # starting point
break
else : # shouldn't reach here
print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error']))
return result
def overall(self) :
result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0}
for token in self.data :
result['all'] = result['all'] + self.data[token]['all']
result['cor'] = result['cor'] + self.data[token]['cor']
result['sub'] = result['sub'] + self.data[token]['sub']
result['ins'] = result['ins'] + self.data[token]['ins']
result['del'] = result['del'] + self.data[token]['del']
return result
def cluster(self, data) :
result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0}
for token in data :
if token in self.data :
result['all'] = result['all'] + self.data[token]['all']
result['cor'] = result['cor'] + self.data[token]['cor']
result['sub'] = result['sub'] + self.data[token]['sub']
result['ins'] = result['ins'] + self.data[token]['ins']
result['del'] = result['del'] + self.data[token]['del']
return result
def keys(self) :
return list(self.data.keys())
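# Display width of a token: East-Asian wide/fullwidth/ambiguous characters
# (categories A/F/W) count as two columns so the lab/rec alignment lines up.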
def width(string):
return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
def default_cluster(word) :
unicode_names = [ unicodedata.name(char) for char in word ]
for i in reversed(range(len(unicode_names))) :
if unicode_names[i].startswith('DIGIT') : # 1
unicode_names[i] = 'Number' # 'DIGIT'
elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or
unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) :
# 明 / 郎
unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH'
elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or
unicode_names[i].startswith('LATIN SMALL LETTER')) :
# A / a
unicode_names[i] = 'English' # 'LATIN LETTER'
elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め
unicode_names[i] = 'Japanese' # 'GANA LETTER'
elif (unicode_names[i].startswith('AMPERSAND') or
unicode_names[i].startswith('APOSTROPHE') or
unicode_names[i].startswith('COMMERCIAL AT') or
unicode_names[i].startswith('DEGREE CELSIUS') or
unicode_names[i].startswith('EQUALS SIGN') or
unicode_names[i].startswith('FULL STOP') or
unicode_names[i].startswith('HYPHEN-MINUS') or
unicode_names[i].startswith('LOW LINE') or
unicode_names[i].startswith('NUMBER SIGN') or
unicode_names[i].startswith('PLUS SIGN') or
unicode_names[i].startswith('SEMICOLON')) :
# & / ' / @ / ℃ / = / . / - / _ / # / + / ;
del unicode_names[i]
else :
return 'Other'
if len(unicode_names) == 0 :
return 'Other'
if len(unicode_names) == 1 :
return unicode_names[0]
for i in range(len(unicode_names)-1) :
if unicode_names[i] != unicode_names[i+1] :
return 'Other'
return unicode_names[0]
def usage() :
print("compute-wer.py : compute word error rate (WER) and align recognition results and references.")
print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer")
if __name__ == '__main__':
if len(sys.argv) == 1 :
usage()
sys.exit(0)
calculator = Calculator()
cluster_file = ''
ignore_words = set()
tochar = False
verbose= 1
padding_symbol= ' '
case_sensitive = False
max_words_per_line = sys.maxsize
split = None
while len(sys.argv) > 3:
a = '--maxw='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):]
del sys.argv[1]
max_words_per_line = int(b)
continue
a = '--rt='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
remove_tag = (b == 'true') or (b != '0')
continue
a = '--cs='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
case_sensitive = (b == 'true') or (b != '0')
continue
a = '--cluster='
if sys.argv[1].startswith(a):
cluster_file = sys.argv[1][len(a):]
del sys.argv[1]
continue
a = '--splitfile='
if sys.argv[1].startswith(a):
split_file = sys.argv[1][len(a):]
del sys.argv[1]
split = dict()
with codecs.open(split_file, 'r', 'utf-8') as fh:
for line in fh: # line in unicode
words = line.strip().split()
if len(words) >= 2:
split[words[0]] = words[1:]
continue
a = '--ig='
if sys.argv[1].startswith(a):
ignore_file = sys.argv[1][len(a):]
del sys.argv[1]
with codecs.open(ignore_file, 'r', 'utf-8') as fh:
for line in fh: # line in unicode
line = line.strip()
if len(line) > 0:
ignore_words.add(line)
continue
a = '--char='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
tochar = (b == 'true') or (b != '0')
continue
a = '--v='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
verbose=0
try:
verbose=int(b)
except:
if b == 'true' or b != '0':
verbose = 1
continue
a = '--padding-symbol='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
if b == 'space':
padding_symbol= ' '
elif b == 'underline':
padding_symbol= '_'
continue
if True or sys.argv[1].startswith('-'):
#ignore invalid switch
del sys.argv[1]
continue
if not case_sensitive:
ig=set([w.upper() for w in ignore_words])
ignore_words = ig
default_clusters = {}
default_words = {}
ref_file = sys.argv[1]
hyp_file = sys.argv[2]
rec_set = {}
if split and not case_sensitive:
newsplit = dict()
for w in split:
words = split[w]
for i in range(len(words)):
words[i] = words[i].upper()
newsplit[w.upper()] = words
split = newsplit
with codecs.open(hyp_file, 'r', 'utf-8') as fh:
for line in fh:
if tochar:
array = characterize(line)
else:
array = line.strip().split()
if len(array)==0: continue
fid = array[0]
rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split)
# compute error rate on the interaction of reference file and hyp file
for line in open(ref_file, 'r', encoding='utf-8') :
if tochar:
array = characterize(line)
else:
array = line.rstrip('\n').split()
if len(array)==0: continue
fid = array[0]
if fid not in rec_set:
continue
lab = normalize(array[1:], ignore_words, case_sensitive, split)
rec = rec_set[fid]
if verbose:
print('\nutt: %s' % fid)
for word in rec + lab :
if word not in default_words :
default_cluster_name = default_cluster(word)
if default_cluster_name not in default_clusters :
default_clusters[default_cluster_name] = {}
if word not in default_clusters[default_cluster_name] :
default_clusters[default_cluster_name][word] = 1
default_words[word] = default_cluster_name
result = calculator.calculate(lab, rec)
if verbose:
if result['all'] != 0 :
wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
else :
wer = 0.0
print('WER: %4.2f %%' % wer, end = ' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'], result['ins']))
space = {}
space['lab'] = []
space['rec'] = []
for idx in range(len(result['lab'])) :
len_lab = width(result['lab'][idx])
len_rec = width(result['rec'][idx])
length = max(len_lab, len_rec)
space['lab'].append(length-len_lab)
space['rec'].append(length-len_rec)
upper_lab = len(result['lab'])
upper_rec = len(result['rec'])
lab1, rec1 = 0, 0
while lab1 < upper_lab or rec1 < upper_rec:
if verbose > 1:
print('lab(%s):' % fid.encode('utf-8'), end = ' ')
else:
print('lab:', end = ' ')
lab2 = min(upper_lab, lab1 + max_words_per_line)
for idx in range(lab1, lab2):
token = result['lab'][idx]
print('{token}'.format(token = token), end = '')
for n in range(space['lab'][idx]) :
print(padding_symbol, end = '')
print(' ',end='')
print()
if verbose > 1:
print('rec(%s):' % fid.encode('utf-8'), end = ' ')
else:
print('rec:', end = ' ')
rec2 = min(upper_rec, rec1 + max_words_per_line)
for idx in range(rec1, rec2):
token = result['rec'][idx]
print('{token}'.format(token = token), end = '')
for n in range(space['rec'][idx]) :
print(padding_symbol, end = '')
print(' ',end='')
print('\n', end='\n')
lab1 = lab2
rec1 = rec2
if verbose:
print('===========================================================================')
print()
result = calculator.overall()
if result['all'] != 0 :
wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
else :
wer = 0.0
print('Overall -> %4.2f %%' % wer, end = ' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'], result['ins']))
if not verbose:
print()
if verbose:
for cluster_id in default_clusters :
result = calculator.cluster([ k for k in default_clusters[cluster_id] ])
if result['all'] != 0 :
wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
else :
wer = 0.0
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'], result['ins']))
if len(cluster_file) > 0 : # compute separated WERs for word clusters
cluster_id = ''
cluster = []
for line in open(cluster_file, 'r', encoding='utf-8') :
for token in line.rstrip('\n').split() :
# end of cluster reached, like </Keyword>
if token[0:2] == '</' and token[len(token)-1] == '>' and \
token.lstrip('</').rstrip('>') == cluster_id :
result = calculator.cluster(cluster)
if result['all'] != 0 :
wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
else :
wer = 0.0
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'], result['ins']))
cluster_id = ''
cluster = []
# begin of cluster reached, like <Keyword>
elif token[0] == '<' and token[len(token)-1] == '>' and \
cluster_id == '' :
cluster_id = token.lstrip('<').rstrip('>')
cluster = []
# general terms, like WEATHER / CAR / ...
else :
cluster.append(token)
print()
print('===========================================================================')
#!/usr/bin/env python3
# encoding: utf-8
import sys
import argparse
import json
import codecs
import yaml
import torch
import torchaudio
import torchaudio.compliance.kaldi as kaldi
from torch.utils.data import Dataset, DataLoader
torchaudio.set_audio_backend("sox_io")
class CollateFunc(object):
''' Collate function for AudioDataset
'''
def __init__(self, feat_dim, resample_rate):
self.feat_dim = feat_dim
self.resample_rate = resample_rate
pass
def __call__(self, batch):
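# Accumulate sufficient statistics for global CMVN over this batch:
# per-dim feature sums (mean_stat), per-dim squared-feature sums (var_stat)
# and the total frame count (number).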
mean_stat = torch.zeros(self.feat_dim)
var_stat = torch.zeros(self.feat_dim)
number = 0
for item in batch:
value = item[1].strip().split(",")
assert len(value) == 3 or len(value) == 1
wav_path = value[0]
sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate
resample_rate = sample_rate
# len(value) == 3 means segmented wav.scp,
# len(value) == 1 means original wav.scp
if len(value) == 3:
start_frame = int(float(value[1]) * sample_rate)
end_frame = int(float(value[2]) * sample_rate)
waveform, sample_rate = torchaudio.backend.sox_io_backend.load(
filepath=wav_path,
num_frames=end_frame - start_frame,
frame_offset=start_frame)
else:
waveform, sample_rate = torchaudio.load(item[1])
waveform = waveform * (1 << 15)
if self.resample_rate != 0 and self.resample_rate != sample_rate:
resample_rate = self.resample_rate
waveform = torchaudio.transforms.Resample(
orig_freq=sample_rate, new_freq=resample_rate)(waveform)
mat = kaldi.fbank(waveform,
num_mel_bins=self.feat_dim,
dither=0.0,
energy_floor=0.0,
sample_frequency=resample_rate)
mean_stat += torch.sum(mat, axis=0)
var_stat += torch.sum(torch.square(mat), axis=0)
number += mat.shape[0]
return number, mean_stat, var_stat
class AudioDataset(Dataset):
def __init__(self, data_file):
self.items = []
with codecs.open(data_file, 'r', encoding='utf-8') as f:
for line in f:
arr = line.strip().split()
self.items.append((arr[0], arr[1]))
def __len__(self):
return len(self.items)
def __getitem__(self, idx):
return self.items[idx]
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='extract CMVN stats')
parser.add_argument('--num_workers',
default=0,
type=int,
help='num of subprocess workers for processing')
parser.add_argument('--train_config',
default='',
help='training yaml conf')
parser.add_argument('--in_scp', default=None, help='wav scp file')
parser.add_argument('--out_cmvn',
default='global_cmvn',
help='global cmvn file')
doc = "Print log after every log_interval audios are processed."
parser.add_argument("--log_interval", type=int, default=1000, help=doc)
args = parser.parse_args()
with open(args.train_config, 'r') as fin:
configs = yaml.load(fin, Loader=yaml.FullLoader)
feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins']
resample_rate = 0
if 'resample_conf' in configs['dataset_conf']:
resample_rate = configs['dataset_conf']['resample_conf']['resample_rate']
print('using resample and new sample rate is {}'.format(resample_rate))
collate_func = CollateFunc(feat_dim, resample_rate)
dataset = AudioDataset(args.in_scp)
batch_size = 20
data_loader = DataLoader(dataset,
batch_size=batch_size,
shuffle=True,
sampler=None,
num_workers=args.num_workers,
collate_fn=collate_func)
with torch.no_grad():
all_number = 0
all_mean_stat = torch.zeros(feat_dim)
all_var_stat = torch.zeros(feat_dim)
wav_number = 0
for i, batch in enumerate(data_loader):
number, mean_stat, var_stat = batch
all_mean_stat += mean_stat
all_var_stat += var_stat
all_number += number
wav_number += batch_size
if wav_number % args.log_interval == 0:
print(f'processed {wav_number} wavs, {all_number} frames',
file=sys.stderr,
flush=True)
cmvn_info = {
'mean_stat': list(all_mean_stat.tolist()),
'var_stat': list(all_var_stat.tolist()),
'frame_num': all_number
}
with open(args.out_cmvn, 'w') as fout:
fout.write(json.dumps(cmvn_info))
# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import torchaudio
import torchaudio.compliance.kaldi as kaldi
import wenet.dataset.kaldi_io as kaldi_io
# The "sox" backends are deprecated and will be removed in 0.9.0 release.
# So here we use sox_io backend
torchaudio.set_audio_backend("sox_io")
def parse_opts():
parser = argparse.ArgumentParser(description='training your network')
parser.add_argument('--num_mel_bins',
default=80,
type=int,
help='Number of triangular mel-frequency bins')
parser.add_argument('--frame_length',
type=int,
default=25,
help='Frame length in milliseconds')
parser.add_argument('--frame_shift',
type=int,
default=10,
help='Frame shift in milliseconds')
parser.add_argument('--dither',
type=float,
default=0.0,
help='Dithering constant (0.0 means no dither)')
parser.add_argument('--segments', default=None, help='segments file')
parser.add_argument('wav_scp', help='wav scp file')
parser.add_argument('out_ark', help='output ark file')
parser.add_argument('out_scp', help='output scp file')
args = parser.parse_args()
return args
# wav format: <key> <wav_path>
def load_wav_scp(wav_scp_file):
wav_list = []
with open(wav_scp_file, 'r', encoding='utf8') as fin:
for line in fin:
arr = line.strip().split()
assert len(arr) == 2
wav_list.append((arr[0], arr[1]))
return wav_list
# wav format: <key> <wav_path>
def load_wav_scp_dict(wav_scp_file):
wav_dict = {}
with open(wav_scp_file, 'r', encoding='utf8') as fin:
for line in fin:
arr = line.strip().split()
assert len(arr) == 2
wav_dict[arr[0]] = arr[1]
return wav_dict
# Segments format: <key> <wav_key> <start> <end>
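# e.g. (hypothetical line): utt_0001 rec_0001 0.00 3.25   (times in seconds)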
def load_wav_segments(wav_scp_file, segments_file):
wav_dict = load_wav_scp_dict(wav_scp_file)
audio_list = []
with open(segments_file, 'r', encoding='utf8') as fin:
for line in fin:
arr = line.strip().split()
assert len(arr) == 4
key = arr[0]
wav_file = wav_dict[arr[1]]
start = float(arr[2])
end = float(arr[3])
audio_list.append((key, wav_file, start, end))
return audio_list
if __name__ == '__main__':
args = parse_opts()
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
if args.segments is None:
audio_list = load_wav_scp(args.wav_scp)
else:
audio_list = load_wav_segments(args.wav_scp, args.segments)
count = 0
with open(args.out_ark, 'wb') as ark_fout, \
open(args.out_scp, 'w', encoding='utf8') as scp_fout:
for item in audio_list:
if len(item) == 2:
key, wav_path = item
waveform, sample_rate = torchaudio.load_wav(wav_path)
else:
assert len(item) == 4
key, wav_path, start, end = item
sample_rate = torchaudio.info(wav_path).sample_rate
frame_offset = int(start * sample_rate)
num_frames = int((end - start) * sample_rate)
waveform, sample_rate = torchaudio.load_wav(
wav_path, frame_offset, num_frames)
mat = kaldi.fbank(waveform,
num_mel_bins=args.num_mel_bins,
frame_length=args.frame_length,
frame_shift=args.frame_shift,
dither=args.dither,
energy_floor=0.0,
sample_frequency=sample_rate)
mat = mat.detach().numpy()
kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout)
count += 1
if count % 10000 == 0:
logging.info('Progress {}/{}'.format(count, len(audio_list)))
#!/bin/bash
# Copyright 2013 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
# This script operates on a directory, such as in data/train/,
# that contains some subset of the following files:
# feats.scp
# wav.scp
# vad.scp
# spk2utt
# utt2spk
# text
#
# It copies to another directory, possibly adding a specified prefix or a suffix
# to the utterance and/or speaker names. Note, the recording-ids stay the same.
#
# begin configuration section
spk_prefix=
utt_prefix=
spk_suffix=
utt_suffix=
validate_opts= # should rarely be needed.
# end configuration section
. utils/parse_options.sh
if [ $# != 2 ]; then
echo "Usage: "
echo " $0 [options] <srcdir> <destdir>"
echo "e.g.:"
echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1"
echo "Options"
echo " --spk-prefix=<prefix> # Prefix for speaker ids, default empty"
echo " --utt-prefix=<prefix> # Prefix for utterance ids, default empty"
echo " --spk-suffix=<suffix> # Suffix for speaker ids, default empty"
echo " --utt-suffix=<suffix> # Suffix for utterance ids, default empty"
exit 1;
fi
export LC_ALL=C
srcdir=$1
destdir=$2
if [ ! -f $srcdir/utt2spk ]; then
echo "copy_data_dir.sh: no such file $srcdir/utt2spk"
exit 1;
fi
if [ "$destdir" == "$srcdir" ]; then
echo "$0: this script requires <srcdir> and <destdir> to be different."
exit 1
fi
set -e;
mkdir -p $destdir
cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map
cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map
if [ ! -f $srcdir/utt2uniq ]; then
if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then
cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq
fi
else
cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq
fi
cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \
utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk
utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt
if [ -f $srcdir/feats.scp ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp
fi
if [ -f $srcdir/vad.scp ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp
fi
if [ -f $srcdir/segments ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments
cp $srcdir/wav.scp $destdir
else # no segments->wav indexed by utt.
if [ -f $srcdir/wav.scp ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp
fi
fi
if [ -f $srcdir/reco2file_and_channel ]; then
cp $srcdir/reco2file_and_channel $destdir/
fi
if [ -f $srcdir/text ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text
fi
if [ -f $srcdir/utt2dur ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur
fi
if [ -f $srcdir/utt2num_frames ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames
fi
if [ -f $srcdir/reco2dur ]; then
if [ -f $srcdir/segments ]; then
cp $srcdir/reco2dur $destdir/reco2dur
else
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur
fi
fi
if [ -f $srcdir/spk2gender ]; then
utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender
fi
if [ -f $srcdir/cmvn.scp ]; then
utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp
fi
for f in frame_shift stm glm ctm; do
if [ -f $srcdir/$f ]; then
cp $srcdir/$f $destdir
fi
done
rm $destdir/spk_map $destdir/utt_map
echo "$0: copied data from $srcdir to $destdir"
for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do
if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then
echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to"
echo " ... $destdir/.backup/$f"
mkdir -p $destdir/.backup
mv $destdir/$f $destdir/.backup/
fi
done
[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats"
[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text"
echo $validate_opts
echo $destdir
utils/validate_data_dir.sh $validate_opts $destdir
#!/usr/bin/env bash
# Script taken from kaldi repo:
# https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/utils/data/remove_dup_utts.sh
# Remove excess utterances once they appear more than a specified
# number of times with the same transcription, in a data set.
# E.g. useful for removing excess "uh-huh" from training.
if [ $# != 3 ]; then
echo "Usage: remove_dup_utts.sh max-count <src-data-dir> <dest-data-dir>"
echo "e.g.: remove_dup_utts.sh 10 data/train data/train_nodup"
echo "This script is used to filter out utterances that come from over-represented"
echo "transcriptions (such as 'uh-huh'), by limiting the number of repetitions of"
echo "any given word-sequence to a specified value. It's often used to get"
echo "subsets for early stages of training."
exit 1;
fi
maxcount=$1
srcdir=$2
destdir=$3
mkdir -p $destdir
[ ! -f $srcdir/text ] && echo "$0: Invalid input directory $srcdir" && exit 1;
! mkdir -p $destdir && echo "$0: could not create directory $destdir" && exit 1;
! [ "$maxcount" -gt 1 ] && echo "$0: invalid max-count '$maxcount'" && exit 1;
cp $srcdir/* $destdir
cat $srcdir/text | \
perl -e '
$maxcount = shift @ARGV;
@all = ();
$p1 = 103349; $p2 = 71147; $k = 0;
sub random { # our own random number generator: predictable.
$k = ($k + $p1) % $p2;
return ($k / $p2);
}
while(<>) {
push @all, $_;
@A = split(" ", $_);
shift @A;
$text = join(" ", @A);
$count{$text} ++;
}
foreach $line (@all) {
@A = split(" ", $line);
shift @A;
$text = join(" ", @A);
$n = $count{$text};
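# Keep a line of an over-represented transcript with probability maxcount/n,
# so about maxcount copies survive (using the deterministic "random" above).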
if ($n < $maxcount || random() < ($maxcount / $n)) {
print $line;
}
}' $maxcount >$destdir/text
echo "Reduced number of utterances from `cat $srcdir/text | wc -l` to `cat $destdir/text | wc -l`"
# Not doing these steps as this script doesn't exist
# + the calling script already validates data
#echo "Using fix_data_dir.sh to reconcile the other files."
#utils/fix_data_dir.sh $destdir
#rm -r $destdir/.backup
exit 0
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# See ../../COPYING for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program splits up any kind of .scp or archive-type file.
# If there is no utt2spk option it will work on any text file and
# will split it up with an approximately equal number of lines in
# each.
# With the --utt2spk option it will work on anything that has the
# utterance-id as the first entry on each line; the utt2spk file is
# of the form "utterance speaker" (on each line).
# It splits it into equal size chunks as far as it can. If you use the utt2spk
# option it will make sure these chunks coincide with speaker boundaries. In
# this case, if there are more chunks than speakers (and in some other
# circumstances), some of the resulting chunks will be empty and it will print
# an error message and exit with nonzero status.
# You will normally call this like:
# split_scp.pl scp scp.1 scp.2 scp.3 ...
# or
# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
# Note that you can use this script to split the utt2spk file itself,
# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
# You can also call the scripts like:
# split_scp.pl -j 3 0 scp scp.0
# [note: with this option, it assumes zero-based indexing of the split parts,
# i.e. the second number must be 0 <= n < num-jobs.]
use warnings;
$num_jobs = 0;
$job_id = 0;
$utt2spk_file = "";
$one_based = 0;
for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
if ($ARGV[0] eq "-j") {
shift @ARGV;
$num_jobs = shift @ARGV;
$job_id = shift @ARGV;
}
if ($ARGV[0] =~ /--utt2spk=(.+)/) {
$utt2spk_file=$1;
shift;
}
if ($ARGV[0] eq '--one-based') {
$one_based = 1;
shift @ARGV;
}
}
if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
$job_id - $one_based >= $num_jobs)) {
die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
($one_based ? " --one-based" : "") . "'\n"
}
$one_based
and $job_id--;
if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
die
"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
... where 0 <= job-id < num-jobs, or 1 <= job-id <= num-jobs if --one-based.\n";
}
$error = 0;
$inscp = shift @ARGV;
if ($num_jobs == 0) { # without -j option
@OUTPUTS = @ARGV;
} else {
for ($j = 0; $j < $num_jobs; $j++) {
if ($j == $job_id) {
if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
else { push @OUTPUTS, "-"; }
} else {
push @OUTPUTS, "/dev/null";
}
}
}
if ($utt2spk_file ne "") { # We have the --utt2spk option...
open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
while(<$u_fh>) {
@A = split;
@A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
($u,$s) = @A;
$utt2spk{$u} = $s;
}
close $u_fh;
open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
@spkrs = ();
while(<$i_fh>) {
@A = split;
if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
$u = $A[0];
$s = $utt2spk{$u};
defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
if(!defined $spk_count{$s}) {
push @spkrs, $s;
$spk_count{$s} = 0;
$spk_data{$s} = []; # ref to new empty array.
}
$spk_count{$s}++;
push @{$spk_data{$s}}, $_;
}
# Now split as equally as possible ..
# First allocate spks to files by allocating an approximately
# equal number of speakers.
$numspks = @spkrs; # number of speakers.
$numscps = @OUTPUTS; # number of output files.
if ($numspks < $numscps) {
die "$0: Refusing to split data because number of speakers $numspks " .
"is less than the number of output .scp files $numscps\n";
}
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
$scparray[$scpidx] = []; # [] is array reference.
}
for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
$scpidx = int(($spkidx*$numscps) / $numspks);
$spk = $spkrs[$spkidx];
push @{$scparray[$scpidx]}, $spk;
$scpcount[$scpidx] += $spk_count{$spk};
}
# Now will try to reassign beginning + ending speakers
# to different scp's and see if it gets more balanced.
# Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
# We can show that if considering changing just 2 scp's, we minimize
# this by minimizing the squared difference in sizes. This is
# equivalent to minimizing the absolute difference in sizes. This
# shows this method is bound to converge.
$changed = 1;
while($changed) {
$changed = 0;
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
# First try to reassign ending spk of this scp.
if($scpidx < $numscps-1) {
$sz = @{$scparray[$scpidx]};
if($sz > 0) {
$spk = $scparray[$scpidx]->[$sz-1];
$count = $spk_count{$spk};
$nutt1 = $scpcount[$scpidx];
$nutt2 = $scpcount[$scpidx+1];
if( abs( ($nutt2+$count) - ($nutt1-$count))
< abs($nutt2 - $nutt1)) { # Would decrease
# size-diff by reassigning spk...
$scpcount[$scpidx+1] += $count;
$scpcount[$scpidx] -= $count;
pop @{$scparray[$scpidx]};
unshift @{$scparray[$scpidx+1]}, $spk;
$changed = 1;
}
}
}
if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
$spk = $scparray[$scpidx]->[0];
$count = $spk_count{$spk};
$nutt1 = $scpcount[$scpidx-1];
$nutt2 = $scpcount[$scpidx];
if( abs( ($nutt2-$count) - ($nutt1+$count))
< abs($nutt2 - $nutt1)) { # Would decrease
# size-diff by reassigning spk...
$scpcount[$scpidx-1] += $count;
$scpcount[$scpidx] -= $count;
shift @{$scparray[$scpidx]};
push @{$scparray[$scpidx-1]}, $spk;
$changed = 1;
}
}
}
}
# Now print out the files...
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
$scpfile = $OUTPUTS[$scpidx];
($scpfile ne '-' ? open($f_fh, '>', $scpfile)
: open($f_fh, '>&', \*STDOUT)) ||
die "$0: Could not open scp file $scpfile for writing: $!\n";
$count = 0;
if(@{$scparray[$scpidx]} == 0) {
print STDERR "$0: error: split_scp.pl producing empty .scp file " .
"$scpfile (too many splits and too few speakers?)\n";
$error = 1;
} else {
foreach $spk ( @{$scparray[$scpidx]} ) {
print $f_fh @{$spk_data{$spk}};
$count += $spk_count{$spk};
}
$count == $scpcount[$scpidx] || die "Count mismatch [code error]";
}
close($f_fh);
}
} else {
# This block is the "normal" case where there is no --utt2spk
# option and we just break into equal size chunks.
open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
$numscps = @OUTPUTS; # size of array.
@F = ();
while(<$i_fh>) {
push @F, $_;
}
$numlines = @F;
if($numlines == 0) {
print STDERR "$0: error: empty input scp file $inscp\n";
$error = 1;
}
$linesperscp = int( $numlines / $numscps); # the "whole part"..
$linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
$remainder = $numlines - ($linesperscp * $numscps);
($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
# [just doing int() rounds down].
$n = 0;
for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
$scpfile = $OUTPUTS[$scpidx];
($scpfile ne '-' ? open($o_fh, '>', $scpfile)
: open($o_fh, '>&', \*STDOUT)) ||
die "$0: Could not open scp file $scpfile for writing: $!\n";
for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
print $o_fh $F[$n++];
}
close($o_fh) || die "$0: Error closing scp file $scpfile: $!\n";
}
$n == $numlines || die "$n != $numlines [code error]";
}
exit ($error);
#!/usr/bin/env bash
# Copyright 2021 Mobvoi Inc. All Rights Reserved.
# Author: binbinzhang@mobvoi.com (Binbin Zhang)
export GLOG_logtostderr=1
export GLOG_v=2
set -e
nj=1
chunk_size=-1
ctc_weight=0.0
reverse_weight=0.0
rescoring_weight=1.0
# For CTC WFST based decoding
fst_path=
dict_path=
acoustic_scale=1.0
beam=15.0
lattice_beam=12.0
min_active=200
max_active=7000
blank_skip_thresh=1.0
length_penalty=0.0
. tools/parse_options.sh || exit 1;
if [ $# != 5 ]; then
echo "Usage: $0 [options] <wav.scp> <label_file> <model_file> <unit_file> <output_dir>"
exit 1;
fi
if ! which decoder_main > /dev/null; then
echo "decoder_main is not built, please go to runtime/libtorch to build it."
exit 1;
fi
scp=$1
label_file=$2
model_file=$3
unit_file=$4
dir=$5
mkdir -p $dir/split${nj}
# Step 1. Split wav.scp
split_scps=""
for n in $(seq ${nj}); do
split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp"
done
tools/data/split_scp.pl ${scp} ${split_scps}
# Step 2. Parallel decoding
wfst_decode_opts=
if [ ! -z $fst_path ]; then
wfst_decode_opts="--fst_path $fst_path"
wfst_decode_opts="$wfst_decode_opts --beam $beam"
wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path"
wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam"
wfst_decode_opts="$wfst_decode_opts --max_active $max_active"
wfst_decode_opts="$wfst_decode_opts --min_active $min_active"
wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale"
wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh"
wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty"
echo $wfst_decode_opts > $dir/config
fi
for n in $(seq ${nj}); do
{
decoder_main \
--rescoring_weight $rescoring_weight \
--ctc_weight $ctc_weight \
--reverse_weight $reverse_weight \
--chunk_size $chunk_size \
--wav_scp ${dir}/split${nj}/wav.${n}.scp \
--model_path $model_file \
--unit_path $unit_file \
$wfst_decode_opts \
--result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log
} &
done
wait
# Step 3. Merge files
for n in $(seq ${nj}); do
cat ${dir}/split${nj}/${n}.text
done > ${dir}/text
tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf
# Step 4. Compute WER
python3 tools/compute-wer.py --char=1 --v=1 \
$label_file $dir/text > $dir/wer
#!/bin/bash
# Begin configuration section.
nj=4
cmd=run.pl
verbose=0
filetype=""
preprocess_conf=""
# End configuration section.
help_message=$(cat << EOF
Usage: $0 [options] <input-scp> <output-scp> [<log-dir>]
e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log
Options:
--nj <nj> # number of parallel jobs
--cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
--filetype <mat|hdf5|sound.hdf5> # Specify the format of feats file
--preprocess-conf <json> # Apply preprocess to feats when creating shape.scp
--verbose <num> # Default: 0
EOF
)
echo "$0 $*" 1>&2 # Print the command line for logging
. parse_options.sh || exit 1;
if [ $# -lt 2 ] || [ $# -gt 3 ]; then
echo "${help_message}" 1>&2
exit 1;
fi
set -euo pipefail
scp=$1
outscp=$2
data=$(dirname ${scp})
if [ $# -eq 3 ]; then
logdir=$3
else
logdir=${data}/log
fi
mkdir -p ${logdir}
split_scps=""
for n in $(seq ${nj}); do
split_scps="${split_scps} ${logdir}/feats.${n}.scp"
done
utils/split_scp.pl ${scp} ${split_scps}
if [ -n "${preprocess_conf}" ]; then
preprocess_opt="--preprocess-conf ${preprocess_conf}"
else
preprocess_opt=""
fi
if [ -n "${filetype}" ]; then
filetype_opt="--filetype ${filetype}"
else
filetype_opt=""
fi
${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \
feat-to-len --verbose=${verbose} \
scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp
feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -)
# concatenate the .scp files together.
for n in $(seq ${nj}); do
sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp
done > ${outscp}
rm -f ${logdir}/feats.*.scp 2>/dev/null
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation
# Johns Hopkins University (author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch
$exclude = 0;
$field = 1;
$shifted = 0;
do {
$shifted=0;
if ($ARGV[0] eq "--exclude") {
$exclude = 1;
shift @ARGV;
$shifted=1;
}
if ($ARGV[0] eq "-f") {
$field = $ARGV[1];
shift @ARGV; shift @ARGV;
$shifted=1
}
} while ($shifted);
if(@ARGV < 1 || @ARGV > 2) {
die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
"Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
"Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
"only the lines that were *not* in id_list.\n" .
"Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
"If your older scripts (written before Oct 2014) stopped working and you used the\n" .
"-f option, add 1 to the argument.\n" .
"See also: utils/filter_scp.pl .\n";
}
$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
@A = split;
@A>=1 || die "Invalid id-list file line $_";
$seen{$A[0]} = 1;
}
if ($field == 1) { # Treat this as special case, since it is common.
while(<>) {
$_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
# $1 is what we filter on.
if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
print $_;
}
}
} else {
while(<>) {
@A = split;
@A > 0 || die "Invalid scp file line $_";
@A >= $field || die "Invalid scp file line $_";
if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
print $_;
}
}
}
# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
#!/bin/bash
# This script makes sure that only the segments present in
# all of "feats.scp", "wav.scp" [if present], segments [if present]
# text, and utt2spk are present in any of them.
# It puts the original contents of data-dir into
# data-dir/.backup
cmd="$@"
utt_extra_files=
spk_extra_files=
. tools/parse_options.sh
if [ $# != 1 ]; then
echo "Usage: utils/data/fix_data_dir.sh <data-dir>"
echo "e.g.: utils/data/fix_data_dir.sh data/train"
echo "This script helps ensure that the various files in a data directory"
echo "are correctly sorted and filtered, for example removing utterances"
echo "that have no features (if feats.scp is present)"
exit 1
fi
data=$1
if [ -f $data/images.scp ]; then
image/fix_data_dir.sh $cmd
exit $?
fi
[ ! -d $data ] && echo "$0: no such directory $data" && exit 1;
[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1;
mkdir -p $data/.backup
set -e -o pipefail -u
tmpdir=$(mktemp -d /tmp/kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM
export LC_ALL=C
function check_sorted {
file=$1
sort -k1,1 -u <$file >$file.tmp
if ! cmp -s $file $file.tmp; then
echo "$0: file $1 is not in sorted order or not unique, sorting it"
mv $file.tmp $file
else
rm $file.tmp
fi
}
for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \
reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do
if [ -f $data/$x ]; then
cp $data/$x $data/.backup/$x
check_sorted $data/$x
fi
done
function filter_file {
filter=$1
file_to_filter=$2
cp $file_to_filter ${file_to_filter}.tmp
tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter
if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then
length1=$(cat ${file_to_filter}.tmp | wc -l)
length2=$(cat ${file_to_filter} | wc -l)
if [ $length1 -ne $length2 ]; then
echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter."
fi
fi
rm $file_to_filter.tmp
}
function filter_recordings {
# We call this once before the stage when we filter on utterance-id, and once
# after.
if [ -f $data/segments ]; then
# We have a segments file -> we need to filter this and the file wav.scp, and
# reco2file_and_utt, if it exists, to make sure they have the same list of
# recording-ids.
if [ ! -f $data/wav.scp ]; then
echo "$0: $data/segments exists but not $data/wav.scp"
exit 1;
fi
awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings
n1=$(cat $tmpdir/recordings | wc -l)
[ ! -s $tmpdir/recordings ] && \
echo "Empty list of recordings (bad file $data/segments)?" && exit 1;
tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp
mv $tmpdir/recordings.tmp $tmpdir/recordings
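# Note: the awk command below temporarily swaps the first two fields so that the
# recording-id comes first, letting filter_scp.pl filter segments on it; the
# second, identical swap after filtering restores the original field order.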
cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments
filter_file $tmpdir/recordings $data/segments
cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments
rm $data/segments.tmp
filter_file $tmpdir/recordings $data/wav.scp
[ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel
[ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur
true
fi
}
function filter_speakers {
# throughout this program, we regard utt2spk as primary and spk2utt as derived, so...
tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt
cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers
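# First narrow the speaker list itself: here the per-speaker file acts as the
# filter (note the argument order), so speakers missing from cmvn.scp or
# spk2gender are dropped from $tmpdir/speakers. The second loop below then
# filters those files down to the surviving speakers.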
for s in cmvn.scp spk2gender; do
f=$data/$s
if [ -f $f ]; then
filter_file $f $tmpdir/speakers
fi
done
filter_file $tmpdir/speakers $data/spk2utt
tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk
for s in cmvn.scp spk2gender $spk_extra_files; do
f=$data/$s
if [ -f $f ]; then
filter_file $tmpdir/speakers $f
fi
done
}
function filter_utts {
cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts
echo "$(cat $tmpdir/utts | wc -l)"
! cat $data/utt2spk | sort | cmp - $data/utt2spk && \
echo "utt2spk is not in sorted order (fix this yourself)" && exit 1;
! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \
echo "utt2spk is not in sorted order when sorted first on speaker-id " && \
echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1;
! cat $data/spk2utt | sort | cmp - $data/spk2utt && \
echo "spk2utt is not in sorted order (fix this yourself)" && exit 1;
if [ -f $data/utt2uniq ]; then
! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \
echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1;
fi
maybe_wav=
maybe_reco2dur=
[ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist.
[ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts
maybe_utt2dur=
if [ -f $data/utt2dur ]; then
cat $data/utt2dur | \
awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1
maybe_utt2dur=utt2dur.ok
fi
maybe_utt2num_frames=
if [ -f $data/utt2num_frames ]; then
cat $data/utt2num_frames | \
awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1
maybe_utt2num_frames=utt2num_frames.ok
fi
for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do
if [ -f $data/$x ]; then
tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp
echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)"
mv $tmpdir/utts.tmp $tmpdir/utts
# echo "$tmpdir/utts"
fi
done
rm $data/utt2dur.ok 2>/dev/null || true
rm $data/utt2num_frames.ok 2>/dev/null || true
[ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \
rm $tmpdir/utts && exit 1;
if [ -f $data/utt2spk ]; then
new_nutts=$(cat $tmpdir/utts | wc -l)
old_nutts=$(cat $data/utt2spk | wc -l)
if [ $new_nutts -ne $old_nutts ]; then
echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts"
else
echo "fix_data_dir.sh: kept all $old_nutts utterances."
fi
fi
for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do
if [ -f $data/$x ]; then
cp $data/$x $data/.backup/$x
if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then
tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x
fi
fi
done
}
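# Recordings and speakers are filtered both before and after the utterance-level
# pass: filter_utts can drop utterances, which may leave speakers or recordings
# with no remaining utterances, so filter_speakers and filter_recordings are run
# again afterwards to clean those up.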
filter_recordings
filter_speakers
filter_utts
filter_speakers
filter_recordings
tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt
echo "fix_data_dir.sh: old files are kept in $data/.backup"
#!/usr/bin/env python3
# encoding: utf-8
import sys
from flake8.main import git
if __name__ == '__main__':
sys.exit(
git.hook(
strict=True,
lazy=git.config_for('lazy'),
)
)
#!/bin/bash
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Mobvoi Corporation (Author: Di Wu)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
echo "$0 $*" >&2 # Print the command line for logging
. ./path.sh
nj=1
cmd=run.pl
nlsyms=""
lang=""
feat=""
feat_type="kaldi"
oov="<unk>"
bpecode=""
allow_one_column=false
raw=""
verbose=0
trans_type=char
filetype=""
preprocess_conf=""
category=""
out="" # If omitted, write in stdout
help_message=$(cat << EOF
Usage: $0 <data-dir> <dict>
e.g. $0 data/train data/lang_1char/train_units.txt
Options:
--nj <nj> # number of parallel jobs
--cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
--feat <feat-scp> # feat.scp or feat1.scp,feat2.scp,...
--feat-type <feat-type> # kaldi or wav
--oov <oov-word> # Default: <unk>
--out <outputfile>                               # If omitted, write to stdout
--filetype <mat|hdf5|sound.hdf5> # Specify the format of feats file
--preprocess-conf <json> # Apply preprocess to feats when creating shape.scp
--verbose <num> # Default: 0
EOF
)
. tools/parse_options.sh
if [ $# != 2 ]; then
echo "${help_message}" 1>&2
exit 1;
fi
set -euo pipefail
dir=$1
dic=$2
tmpdir=$(mktemp -d ${dir}/tmp-XXXXX)
#trap 'rm -rf ${tmpdir}' EXIT
# 1. Create scp files for inputs
# These are not necessary in decoding mode, so they are made optional.
input=
if [ -n "${feat}" ]; then
_feat_scps=$(echo "${feat}" | tr ',' ' ' )
read -r -a feat_scps <<< $_feat_scps
num_feats=${#feat_scps[@]}
for (( i=1; i<=num_feats; i++ )); do
feat=${feat_scps[$((i-1))]}
mkdir -p ${tmpdir}/input_${i}
input+="input_${i} "
cat ${feat} > ${tmpdir}/input_${i}/feat.scp
# Dump in the "legacy" style JSON format
if [ -n "${filetype}" ]; then
awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \
> ${tmpdir}/input_${i}/filetype.scp
fi
if [ ${feat_type} == "kaldi" ]; then
tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \
--filetype "${filetype}" \
--preprocess-conf "${preprocess_conf}" \
--verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp
elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then
if [ -f $dir/segments ]; then
# used for segmented wav.scp
awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur
fi
if [ ! -f $dir/utt2dur ]; then
tools/wav_to_duration.sh --nj ${nj} \
${feat} ${tmpdir}/input_${i}/shape.scp
else
# use the existing utt2dur as shape.scp directly
cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp
fi
fi
done
fi
# 2. Create scp files for outputs
mkdir -p ${tmpdir}/output
if [ -n "${bpecode}" ]; then
if [ "${trans_type}" == "cn_char_en_bpe" ]; then
tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp
else
paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \
| tools/spm_encode --model=${bpecode} --output_format=piece) \
> ${tmpdir}/output/token.scp
fi
elif [ -n "${nlsyms}" ]; then
tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp
elif [ -n "${raw}" ]; then
cat $dir/text > ${tmpdir}/output/token.scp
else
tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp
fi
< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp
odim=$(cat ${dic} | wc -l)
< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp
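# e.g. (hypothetical ids) a tokenid.scp line "utt1 23 7 105" with odim=4233
# yields the shape.scp line "utt1 3,4233": NF-1 is the token count, odim the
# dictionary size.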
cat ${dir}/text > ${tmpdir}/output/text.scp
# 3. Create scp files for the others
mkdir -p ${tmpdir}/other
if [ -n "${lang}" ]; then
awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp
fi
if [ -n "${category}" ]; then
awk -v category=${category} '{print $1 " " category}' ${dir}/text \
> ${tmpdir}/other/category.scp
fi
#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp
# 4. Merge the scp files into one file
opts=""
for intype in ${input} output other; do
if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then
continue
fi
if [ ${intype} != other ]; then
opts+="--${intype%_*}-scps "
else
opts+="--scps "
fi
for x in "${tmpdir}/${intype}"/*.scp; do
k=$(basename ${x} .scp)
if [ ${k} = shape ]; then
opts+="shape:${x}:shape "
else
opts+="${k}:${x} "
fi
done
done
if ${allow_one_column}; then
opts+="--allow-one-column true "
else
opts+="--allow-one-column false "
fi
if [ -n "${out}" ]; then
opts+="-O ${out}"
fi
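# At this point ${opts} might look roughly like (hypothetical paths, wav input, no --out):
#   --input-scps feat:${tmpdir}/input_1/feat.scp shape:${tmpdir}/input_1/shape.scp:shape
#   --output-scps shape:${tmpdir}/output/shape.scp:shape text:${tmpdir}/output/text.scp
#                 token:${tmpdir}/output/token.scp tokenid:${tmpdir}/output/tokenid.scp
#   --allow-one-column false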
tools/merge_scp2txt.py --verbose ${verbose} ${opts}
#rm -fr ${tmpdir}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# 2013-2016 Johns Hopkins University (author: Daniel Povey)
# 2015 Hainan Xu
# 2015 Guoguo Chen
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# is reserved for use in the grammar).
# Outputs the number of disambig syms to the standard output.
# With the --pron-probs option, expects the second field
# of each lexicon line to be a pron-prob.
# With the --sil-probs option, expects three additional
# fields after the pron-prob, representing various components
# of the silence probability model.
$pron_probs = 0;
$sil_probs = 0;
$first_allowed_disambig = 1;
for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
if ($ARGV[0] eq "--pron-probs") {
$pron_probs = 1;
shift @ARGV;
}
if ($ARGV[0] eq "--sil-probs") {
$sil_probs = 1;
shift @ARGV;
}
if ($ARGV[0] eq "--first-allowed-disambig") {
$first_allowed_disambig = 0 + $ARGV[1];
if ($first_allowed_disambig < 1) {
die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
}
shift @ARGV;
shift @ARGV;
}
}
if (@ARGV != 2) {
die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
"This script adds disambiguation symbols to a lexicon in order to\n" .
"make decoding graphs determinizable; it adds pseudo-phone\n" .
"disambiguation symbols #1, #2 and so on at the ends of phones\n" .
"to ensure that all pronunciations are different, and that none\n" .
"is a prefix of another.\n" .
"It prints to the standard output the number of the largest-numbered" .
"disambiguation symbol that was used.\n" .
"\n" .
"Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" .
" --sil-probs [should be with --pron-probs option]\n" .
" Expect 3 extra fields after the pron-probs, for aspects of\n" .
" the silence probability model\n" .
" --first-allowed-disambig <n> The number of the first disambiguation symbol\n" .
" that this script is allowed to add. By default this is\n" .
" #1, but you can set this to a larger value using this option.\n" .
"e.g.:\n" .
" add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
" add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
" add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
}
$lexfn = shift @ARGV;
$lexoutfn = shift @ARGV;
open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
# (1) Read in the lexicon.
@L = ( );
while(<L>) {
@A = split(" ", $_);
push @L, join(" ", @A);
}
# (2) Work out the count of each phone-sequence in the
# lexicon.
foreach $l (@L) {
@A = split(" ", $l);
shift @A; # Remove word.
if ($pron_probs) {
$p = shift @A;
if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; }
}
if ($sil_probs) {
$silp = shift @A;
if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; }
$correction = shift @A;
if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
$correction = shift @A;
if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
}
if (!(@A)) {
die "Bad lexicon line $1, no phone in phone list";
}
$count{join(" ",@A)}++;
}
# (3) For each left sub-sequence of each phone-sequence, note down
# that it exists (for identifying prefixes of longer strings).
foreach $l (@L) {
@A = split(" ", $l);
shift @A; # Remove word.
if ($pron_probs) { shift @A; } # remove pron-prob.
if ($sil_probs) {
shift @A; # Remove silprob
shift @A; # Remove silprob
}
while(@A > 0) {
pop @A; # Remove last phone
$issubseq{join(" ",@A)} = 1;
}
}
# (4) For each entry in the lexicon:
# if the phone sequence is unique and is not a
# prefix of another word, no disambig symbol.
# Else output #1, or #2, #3, ... if the same phone-seq
# has already been assigned a disambig symbol.
open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";
# max_disambig will always be the highest-numbered disambiguation symbol that
# has been used so far.
$max_disambig = $first_allowed_disambig - 1;
foreach $l (@L) {
@A = split(" ", $l);
$word = shift @A;
if ($pron_probs) {
$pron_prob = shift @A;
}
if ($sil_probs) {
$sil_word_prob = shift @A;
$word_sil_correction = shift @A;
$prev_nonsil_correction = shift @A;
}
$phnseq = join(" ", @A);
if (!defined $issubseq{$phnseq}
&& $count{$phnseq} == 1) {
; # Do nothing.
} else {
if ($phnseq eq "") { # need disambig symbols for the empty string
# that are not used anywhere else.
$max_disambig++;
$reserved_for_the_empty_string{$max_disambig} = 1;
$phnseq = "#$max_disambig";
} else {
$cur_disambig = $last_used_disambig_symbol_of{$phnseq};
if (!defined $cur_disambig) {
$cur_disambig = $first_allowed_disambig;
} else {
$cur_disambig++; # Get a number that has not been used yet for
# this phone sequence.
}
while (defined $reserved_for_the_empty_string{$cur_disambig}) {
$cur_disambig++;
}
if ($cur_disambig > $max_disambig) {
$max_disambig = $cur_disambig;
}
$last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
$phnseq = $phnseq . " #" . $cur_disambig;
}
}
if ($pron_probs) {
if ($sil_probs) {
print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n";
} else {
print O "$word\t$pron_prob\t$phnseq\n";
}
} else {
print O "$word\t$phnseq\n";
}
}
print $max_disambig . "\n";
#!/bin/bash
# Copyright 2015 Yajie Miao (Carnegie Mellon University)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the
# phoneme and character-based lexicons.
set -eo pipefail
. tools/parse_options.sh
if [ $# -ne 3 ]; then
echo "usage: tools/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>"
echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang"
echo "<dict-src-dir> should contain the following files:"
echo "lexicon.txt units.txt"
echo "options: "
exit 1;
fi
srcdir=$1
tmpdir=$2
dir=$3
mkdir -p $dir $tmpdir
[ -f path.sh ] && . ./path.sh
export LC_ALL=C
cp $srcdir/units.txt $dir
# Add probabilities to lexicon entries. There is in fact no point in doing this here, since all the entries have probability 1.0.
# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is.
perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1;
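# e.g. a hypothetical entry "HELLO h e l l o" becomes "HELLO 1.0<TAB>h e l l o".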
# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.
# Without these symbols, determinization will fail.
ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt`
ndisambig=$((ndisambig+1));
( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list
# Get the full list of CTC tokens used in FST. These tokens include <eps>, the blank <blk>,
# the actual model unit, and the disambiguation symbols.
cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
(echo '<eps>';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt
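# e.g. with a hypothetical units.txt containing "<blank>", "a" and "b", and ndisambig=2,
# tokens.txt would be:
#   <eps> 0
#   <blank> 1
#   a 2
#   b 3
#   #0 4
#   #1 5
#   #2 6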
# ctc_token_fst_corrected is too big and too slow for character-based Chinese modeling,
# so we use ctc_token_fst_compact here
tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \
fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;
# Encode the words with indices. Will be used in lexicon and language model FST compiling.
cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk '
BEGIN {
print "<eps> 0";
}
{
printf("%s %d\n", $1, NR);
}
END {
printf("#0 %d\n", NR+1);
printf("<s> %d\n", NR+2);
printf("</s> %d\n", NR+3);
}' > $dir/words.txt || exit 1;
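# e.g. for a hypothetical lexicon whose only words are "a" and "b", words.txt would be:
#   <eps> 0
#   a 1
#   b 2
#   #0 3
#   <s> 4
#   </s> 5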
# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`
tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
echo "Lexicon and token FSTs compiling succeeded"