Commit 764b3a75 authored by Sugon_ldc

add new model
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
# 2022 burkliu(boji123@aliyun.com)
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3"
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
# The num of nodes or machines used for multi-machine training
# Default 1 for single machine/node
# NFS will be needed if you want to run multi-machine training
num_nodes=1
# The rank of each node or machine, ranging from 0 to num_nodes - 1.
# The first node/machine sets node_rank 0, the second one sets node_rank 1,
# the third one sets node_rank 2, and so on. Default 0
node_rank=0
# modify this to your AISHELL-2 data path
# Note: the evaluation data (dev & test) is available at AISHELL.
# Please download it from http://aishell-eval.oss-cn-beijing.aliyuncs.com/TEST%26DEV%20DATA.zip
train_set=/cfs/share/corpus/aishell-2/AISHELL-2/iOS/data
dev_set=/cfs/share/corpus/aishell-2/AISHELL-DEV-TEST-SET/iOS/dev
test_set=/cfs/share/corpus/aishell-2/AISHELL-DEV-TEST-SET/iOS/test
nj=16
dict=data/dict/lang_char.txt
train_set=train
train_config=conf/conformer_u2pp_rnnt.yaml
cmvn=true
dir=exp/`basename ${train_config%.*}`
checkpoint=
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=30
decode_modes="rnnt_beam_search"
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=-1
# only used in rescore mode for weighting different scores
rescore_ctc_weight=0.5
rescore_transducer_weight=0.5
rescore_attn_weight=0.5
# only used in beam search, either pure beam search mode OR beam search inside rescoring
search_ctc_weight=0.3
search_transducer_weight=0.7
. tools/parse_options.sh || exit 1;
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Data preparation
local/prepare_data.sh ${train_set} data/local/${train_set} data/${train_set} || exit 1;
local/prepare_data.sh ${dev_set} data/local/dev data/dev || exit 1;
local/prepare_data.sh ${test_set} data/local/test data/test || exit 1;
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# remove the space between the text labels for Mandarin dataset
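# Illustrative example (hypothetical utterance id and text): the pipeline below
# turns "ID0001W0001 da vinci 密 码" into "ID0001W0001 DA▁VINCI密码", i.e. it
# upper-cases English, joins adjacent English words with ▁, and strips the
# remaining spaces between Mandarin characters.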
for x in ${train_set} dev test; do
cp data/${x}/text data/${x}/text.org
paste -d " " <(cut -f 1 data/${x}/text.org) <(cut -f 2- data/${x}/text.org \
| tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") \
> data/${x}/text
rm data/${x}/text.org
done
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp data/${train_set}/wav.scp \
--out_cmvn data/$train_set/global_cmvn
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Make train dict
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
tools/text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
| sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# Prepare wenet required data
echo "Prepare data, prepare required format"
for x in dev test ${train_set}; do
tools/make_raw_list.py data/$x/wav.scp data/$x/text data/$x/data.list
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
# You should rm it manually before you start run.sh on the first node.
# rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
# The number of GPUs running on each node/machine
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
#dist_backend="nccl"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
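# Illustrative example: with num_nodes=2 and 4 GPUs per node, world_size is 8;
# node 0 launches ranks 0-3 and node 1 launches ranks 4-7
# (rank = node_rank * num_gpus + i, as computed below).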
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp data/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with the model input
# and output dimensions; train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process, used for knowing whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type raw \
--symbol_table $dict \
--train_data data/$train_set/data.list \
--cv_data data/dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 4 \
$cmvn_opts \
2>&1 | tee -a $dir/train.log || exit 1;
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test the model. Please specify the model you want to test via --checkpoint
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best \
2>&1 | tee -a $dir/average.log || exit 1;
fi
for mode in ${decode_modes}; do
{
test_dir=$dir/test_${mode}_chunk_${decoding_chunk_size}
mkdir -p $test_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type raw \
--test_data data/test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $rescore_ctc_weight \
--transducer_weight $rescore_transducer_weight \
--attn_weight $rescore_attn_weight \
--search_ctc_weight $search_ctc_weight \
--search_transducer_weight $search_transducer_weight \
--result_file $test_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
python tools/compute-wer.py --char=1 --v=1 \
data/test/text $test_dir/text > $test_dir/wer
} &
done
wait
fi
../../../tools
../../../wenet
# Performance Record
## U2++ Conformer Result
* Feature info: using fbank feature, with cmvn, no speed perturb, with dither
* Training info: lr 0.001, batch size 32, 8 gpus, acc_grad 1, 240 epochs, dither 1.0
* Decoding info: ctc_weight 0.1, reverse_weight 0.4, average_num 30
* Git hash: 5a1342312668e7a5abb83aed1e53256819cebf95
| decoding mode/chunk size | full | 16 |
|---------------------------|-------|-------|
| ctc greedy search | 6.18 | 6.79 |
| ctc prefix beam search | 6.20 | 6.80 |
| attention rescoring | 5.39 | 5.78 |
| LM + attention rescoring | 5.35 | 5.73 |
## U2++ Transformer Result
* Feature info: using fbank feature, with cmvn, no speed perturb
* Training info: lr 0.002, batch size 22, 8 gpus, acc_grad 1, 240 epochs, dither 0.0
* Decoding info: ctc_weight 0.1, reverse_weight 0.5, average_num 30
* Git hash: 5a1342312668e7a5abb83aed1e53256819cebf95
| decoding mode/chunk size | full | 16 |
|---------------------------|-------|-------|
| ctc greedy search | 7.35 | 8.23 |
| ctc prefix beam search | 7.36 | 8.23 |
| attention rescoring | 6.09 | 6.70 |
| LM + attention rescoring | 6.07 | 6.55 |
## Unified Conformer Result
* Feature info: using fbank feature, with cmvn, no speed perturb.
* Training info: lr 0.002, batch size 16, 8 gpus, acc_grad 1, 120 epochs, dither 1.0
* Decoding info: ctc_weight 0.5, average_num 20
* Git hash: 14d38085a8d966cf9e9577ffafc51d578dce954f
| decoding mode/chunk size | full | 16 | 8 | 4 |
|---------------------------|-------|-------|-------|-------|
| attention decoder | 6.23 | 6.42 | 6.58 | 7.20 |
| ctc greedy search | 6.98 | 7.75 | 8.21 | 9.91 |
| ctc prefix beam search | 7.02 | 7.76 | 8.21 | 9.93 |
| attention rescoring | 6.08 | 6.46 | 6.72 | 7.79 |
| LM + attention rescoring | 5.87 | 6.37 | 6.47 | 6.61 |
## Unified Transformer Result
* Feature info: using fbank feature, with cmvn, no speed perturb.
* Training info: lr 0.002, batch size 22, 8 gpus, acc_grad 1, 180 epochs, dither 0.0
* Decoding info: ctc_weight 0.5, average_num 30
* Git hash: 14d38085a8d966cf9e9577ffafc51d578dce954f
| decoding mode/chunk size | full | 16 | 8 | 4 |
|---------------------------|-------|-------|-------|-------|
| attention decoder | 6.71 | 7.08 | 7.17 | 7.40 |
| ctc greedy search | 7.84 | 8.68 | 8.98 | 9.46 |
| ctc prefix beam search | 7.86 | 8.68 | 8.98 | 9.45 |
| attention rescoring | 6.71 | 7.31 | 7.51 | 7.85 |
| LM + attention rescoring | 6.35 | 7.02 | 7.24 | 7.52 |
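
As a rough reproduction guide (not part of the original recipe), the attention rescoring rows above can be obtained with a recognize.py call like the one in the recipe's test stage; the experiment directory below is a placeholder, and the weights follow the decoding info listed above for the U2++ Conformer.

```bash
# Illustrative only; mirrors the recognize.py invocation used in the test stage of run.sh.
python wenet/bin/recognize.py --gpu 0 \
  --mode attention_rescoring \
  --config exp/u2pp_conformer/train.yaml \
  --checkpoint exp/u2pp_conformer/avg_30.pt \
  --data_type raw \
  --test_data data/test/data.list \
  --dict data/dict/lang_char.txt \
  --beam_size 10 --batch_size 1 --penalty 0.0 \
  --ctc_weight 0.1 --reverse_weight 0.4 \
  --result_file exp/u2pp_conformer/test_attention_rescoring/text
```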
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 8
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
# dataset related
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
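# Note (assumed, matching the WarmupLR scheduler commonly used in WeNet/ESPnet):
# the learning rate at step t is
#   lr * warmup_steps^0.5 * min(t^-0.5, t * warmup_steps^-1.5),
# i.e. it rises linearly over the first 25000 steps and then decays as t^-0.5.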
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder architecture type
normalize_before: true
use_dynamic_chunk: true
use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
# feature extraction
collate_conf:
# waveform level config
wav_distortion_conf:
wav_dither: 1.0
wav_distortion_rate: 0.0
distortion_methods: []
speed_perturb: false
feature_extraction_conf:
feature_type: 'fbank'
mel_bins: 80
frame_shift: 10
# dataset related
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# dataset related
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 120
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder architecture type
normalize_before: true
use_dynamic_chunk: true
use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# feature extraction
collate_conf:
# waveform level config
wav_distortion_conf:
wav_dither: 0.0
wav_distortion_rate: 0.0
distortion_methods: []
speed_perturb: false
feature_extraction_conf:
feature_type: 'fbank'
mel_bins: 80
frame_shift: 10
# dataset related
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 130
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
#!/usr/bin/env bash
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0
# transform raw AISHELL-2 data to kaldi format
if [ $# != 3 ]; then
echo "prepare_data.sh <corpus-data-dir> <tmp-dir> <output-dir>"
echo " e.g prepare_data.sh /data/AISHELL-2/iOS/train data/local/train data/train"
exit 1;
fi
corpus=$1
tmp=$2
dir=$3
echo "prepare_data.sh: Preparing data in $corpus"
mkdir -p $tmp
mkdir -p $dir
# corpus check
if [ ! -d $corpus ] || [ ! -f $corpus/wav.scp ] || [ ! -f $corpus/trans.txt ]; then
echo "Error: $0 requires wav.scp and trans.txt under $corpus directory."
exit 1;
fi
# validate utt-key list
awk '{print $1}' $corpus/wav.scp > $tmp/wav_utt.list
awk '{print $1}' $corpus/trans.txt > $tmp/trans_utt.list
tools/filter_scp.pl -f 1 $tmp/wav_utt.list $tmp/trans_utt.list > $tmp/utt.list
# wav.scp
awk -F'\t' -v path_prefix=$corpus '{printf("%s\t%s/%s\n",$1,path_prefix,$2)}' $corpus/wav.scp > $tmp/tmp_wav.scp
tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tmp/wav.scp
# text
tools/filter_scp.pl -f 1 $tmp/utt.list $corpus/trans.txt | sort -k 1 | uniq > $tmp/trans.txt
dos2unix < $tmp/trans.txt | \
tools/filter_scp.pl -f 1 $tmp/utt.list - | \
sort -k 1 | uniq | tr '[a-z]' '[A-Z]' | \
sed 's/Ａ/A/g' | sed 's/Ｔ/T/g' | sed 's/Ｍ/M/g' | sed 's/𫚉//g' | sed 's/𫖯/頫/g' | \
sed 's/[()]//g' | sed "s/\([^A-Z]\)'/\1/g" > $tmp/text
# copy prepared resources from tmp_dir to target dir
mkdir -p $dir
for f in wav.scp text; do
cp $tmp/$f $dir/$f || exit 1;
done
echo "local/prepare_data.sh succeeded"
exit 0;
#!/bin/bash
# To be run from one directory above this script.
. ./path.sh
text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt
. tools/parse_options.sh
for f in "$text" "$lexicon"; do
[ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done
# Check SRILM tools
if ! which ngram-count > /dev/null; then
echo "srilm tools are not found, please download it and install it from: "
echo "http://www.speech.sri.com/projects/srilm/download.html"
echo "Then add the tools to your PATH"
exit 1
fi
dir=data/local/lm
mkdir -p $dir
cleantext=$dir/text.no_oov
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1;
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
heldout_sent=10000 # Don't change this if you want results to be comparable with
# kaldi_lm results
mkdir -p $dir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
head -$heldout_sent > $dir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
tail -n +$heldout_sent > $dir/train
ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
ngram -lm $dir/lm.arpa -ppl $dir/heldout
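# The resulting 3-gram LM ($dir/lm.arpa) is later consumed by
# tools/fst/make_tlg.sh (run.sh stage 7.4) to build the decoding TLG graph;
# the ngram call above just reports its perplexity on the held-out set.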
#!/usr/bin/env python
# encoding=utf-8
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0
from __future__ import print_function
import sys
import jieba
if len(sys.argv) < 3:
sys.stderr.write(
"word_segmentation.py <vocab> <trans> <word-segmented-trans>\n")
exit(1)
vocab_file = sys.argv[1]
trans_file = sys.argv[2]
jieba.set_dictionary(vocab_file)
for line in open(trans_file, 'r', encoding='utf8'):
key, trans = line.strip().split(' ', 1)
words = jieba.cut(trans,
HMM=False) # turn off new word discovery (HMM-based)
new_line = key + '\t' + " ".join(words)
print(new_line)
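# Example usage (matching run.sh stage 7.2):
#   python local/word_segmentation.py data/local/lm/word_seg_vocab.txt \
#       data/train/text > data/local/lm/text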
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_BIN=${BUILD_DIR}/../fc_base/openfst-build/src
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_BIN}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO
stage=0 # start from 0 if you need to start from data preparation
stop_stage=6
# The num of nodes or machines used for multi-machine training
# Default 1 for single machine/node
# NFS will be needed if you want to run multi-machine training
num_nodes=1
# The rank of each node or machine, ranging from 0 to num_nodes - 1.
# The first node/machine sets node_rank 0, the second one sets node_rank 1,
# the third one sets node_rank 2, and so on. Default 0
node_rank=0
# modify this to your AISHELL-2 data path
# Note: the evaluation data (dev & test) is available at AISHELL.
# Please download it from http://aishell-eval.oss-cn-beijing.aliyuncs.com/TEST%26DEV%20DATA.zip
trn_set=/mnt/nfs/ptm1/open-data/AISHELL-2/iOS/data
dev_set=/mnt/nfs/ptm1/open-data/AISHELL-DEV-TEST-SET/iOS/dev
tst_set=/mnt/nfs/ptm1/open-data/AISHELL-DEV-TEST-SET/iOS/test
nj=16
dict=data/dict/lang_char.txt
train_set=train
# Optional train_config
# 1. conf/train_transformer.yaml: Standard transformer
# 2. conf/train_conformer.yaml: Standard conformer
# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer
# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer
train_config=conf/train_unified_transformer.yaml
cmvn=true
dir=exp/transformer
checkpoint=
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=30
decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring"
. tools/parse_options.sh || exit 1;
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Data preparation
local/prepare_data.sh ${trn_set} data/local/${train_set} data/${train_set} || exit 1;
local/prepare_data.sh ${dev_set} data/local/dev data/dev || exit 1;
local/prepare_data.sh ${tst_set} data/local/test data/test || exit 1;
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# remove the space between the text labels for Mandarin dataset
for x in ${train_set} dev test; do
cp data/${x}/text data/${x}/text.org
paste -d " " <(cut -f 1 data/${x}/text.org) <(cut -f 2- data/${x}/text.org \
| tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") \
> data/${x}/text
rm data/${x}/text.org
done
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp data/${train_set}/wav.scp \
--out_cmvn data/$train_set/global_cmvn
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Make train dict
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
tools/text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
| sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# Prepare wenet required data
echo "Prepare data, prepare required format"
for x in dev test ${train_set}; do
tools/make_raw_list.py data/$x/wav.scp data/$x/text data/$x/data.list
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
# You should rm it manually before you start run.sh on the first node.
# rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
# The number of GPUs running on each node/machine
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp data/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with the model input
# and output dimensions; train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process, used for knowing whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type raw \
--symbol_table $dict \
--train_data data/$train_set/data.list \
--cv_data data/dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 2 \
$cmvn_opts
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test the model. Please specify the model you want to test via --checkpoint
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
ctc_weight=0.5
for mode in ${decode_modes}; do
{
test_dir=$dir/test_${mode}
mkdir -p $test_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type raw \
--test_data data/test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--result_file $test_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
python tools/compute-wer.py --char=1 --v=1 \
data/test/text $test_dir/text > $test_dir/wer
} &
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip \
--output_quant_file $dir/final_quant.zip
fi
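# The exported final.zip / final_quant.zip are TorchScript models; as an
# illustration, they can be loaded in Python with torch.jit.load("final.zip")
# or passed to the libtorch runtime as done by the decode.sh call in stage 7.5.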
# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# 7.1 Prepare dict
unit_file=$dict
download_dir=data/local/DaCiDian
git clone https://github.com/aishell-foundation/DaCiDian.git $download_dir
mkdir -p data/local/dict
cp $unit_file data/local/dict/units.txt
tools/fst/prepare_dict.py $unit_file $download_dir/word_to_pinyin.txt \
data/local/dict/lexicon.txt
# 7.2 Segment text
pip install jieba
lm=data/local/lm
mkdir -p $lm
awk '{print $1}' data/local/dict/lexicon.txt | \
awk '{print $1,99}' > $lm/word_seg_vocab.txt
python local/word_segmentation.py $lm/word_seg_vocab.txt \
data/train/text > $lm/text
# 7.3 Train lm
local/train_lms.sh
# 7.4 Build decoding TLG
tools/fst/compile_lexicon_token_fst.sh \
data/local/dict data/local/tmp data/local/lang
tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
# 7.5 Decoding with runtime
# reverse_weight only works for the u2++ model; when it is set to 0.0,
# only the left-to-right decoder is used.
reverse_weight=0.0
chunk_size=-1
./tools/decode.sh --nj 16 --chunk_size $chunk_size \
--beam 15.0 --lattice_beam 7.5 --max_active 7000 --blank_skip_thresh 0.98 \
--ctc_weight 0.3 --rescoring_weight 1.0 --reverse_weight $reverse_weight \
--fst_path data/lang_test/TLG.fst \
--dict_path data/lang_test/words.txt \
data/test/wav.scp data/test/text $dir/final.zip data/lang_test/units.txt \
$dir/lm_with_runtime
# See $dir/lm_with_runtime for wer
tail $dir/lm_with_runtime/wer
fi
../../../tools/
../../../wenet/
# Performance Record
## Conformer Result
* Feature info: using fbank feature, cmvn, without speed perturb (segments are not supported yet)
* Training info: lr 0.001, max_frames_in_batch 15000, 8 gpus, acc_grad 4, 100 epochs
* Decoding info: ctc_weight 0.5, average_num 30
| decoding mode | Test WER |
|---------------------|----------|
| attention rescoring | 32.58% |
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
filter_conf:
max_length: 4096
min_length: 10
token_max_length: 200
token_min_length: 1
#resample_conf:
# resample_rate: 16000
speed_perturb: false
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'dynamic' # static or dynamic
max_frames_in_batch: 15000
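# Note on 'dynamic' batching (assumed behaviour, as implemented in WeNet's
# dataset processor): utterances are accumulated into a batch until the padded
# size (longest utterance frames x number of utterances) would exceed
# max_frames_in_batch, so the batch size varies with utterance length.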
grad_clip: 5
accum_grad: 4
max_epoch: 100
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 1000
# -*- coding: utf-8 -*-
"""
Process the textgrid files
"""
import argparse
import codecs
from pathlib import Path
import textgrid
class Segment(object):
def __init__(self, uttid, spkr, stime, etime, text):
self.uttid = uttid
self.spkr = spkr
self.stime = round(stime, 2)
self.etime = round(etime, 2)
self.text = text
def get_args():
parser = argparse.ArgumentParser(description="process the textgrid files")
parser.add_argument("--path", type=str, required=True, help="Data path")
args = parser.parse_args()
return args
def main(args):
wav_scp = codecs.open(Path(args.path) / "wav.scp", "r", "utf-8")
textgrid_flist = codecs.open(
Path(args.path) / "textgrid.flist", "r", "utf-8")
# get the path of textgrid file for each utterance
utt2textgrid = {}
for line in textgrid_flist:
path = Path(line.strip())
# the name of textgrid file is different between training and test set
if "train" in path.parts:
uttid = "%s_%s" % (path.parts[-2], path.stem)
else:
uttid = path.stem
utt2textgrid[uttid] = path
# parse the textgrid file for each utterance
all_segments = []
for line in wav_scp:
uttid = line.strip().split(" ")[0]
if uttid not in utt2textgrid:
print("%s doesn't have transcription" % uttid)
continue
segments = []
tg = textgrid.TextGrid.fromFile(utt2textgrid[uttid])
for i in range(tg.__len__()):
for j in range(tg[i].__len__()):
if tg[i][j].mark.strip():
segments.append(
Segment(
uttid,
tg[i].name,
tg[i][j].minTime,
tg[i][j].maxTime,
tg[i][j].mark.strip(),
))
segments = sorted(segments, key=lambda x: x.stime)
all_segments += segments
wav_scp.close()
textgrid_flist.close()
segments_file = codecs.open(Path(args.path) / "segments_all", "w", "utf-8")
utt2spk_file = codecs.open(Path(args.path) / "utt2spk_all", "w", "utf-8")
text_file = codecs.open(Path(args.path) / "text_all", "w", "utf-8")
utt2dur_file = codecs.open(Path(args.path) / "utt2dur_all", "w", "utf-8")
for i in range(len(all_segments)):
utt_name = "%s-%s-%07d-%07d" % (
all_segments[i].uttid,
all_segments[i].spkr,
all_segments[i].stime * 100,
all_segments[i].etime * 100,
)
segments_file.write("%s %s %.2f %.2f\n" % (
utt_name,
all_segments[i].uttid,
all_segments[i].stime,
all_segments[i].etime,
))
utt2spk_file.write(
"%s %s-%s\n" %
(utt_name, all_segments[i].uttid, all_segments[i].spkr))
text_file.write("%s %s\n" % (utt_name, all_segments[i].text))
utt2dur_file.write(
"%s %.2f\n" %
(utt_name, all_segments[i].etime - all_segments[i].stime))
if len(all_segments[i].text) / (all_segments[i].etime -
all_segments[i].stime) > 100:
print(utt_name)
print(
len(all_segments[i].text) /
(all_segments[i].etime - all_segments[i].stime))
segments_file.close()
utt2spk_file.close()
text_file.close()
utt2dur_file.close()
if __name__ == "__main__":
args = get_args()
main(args)
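# Example usage (the script path is an assumption; adjust to where this file
# lives in the recipe):
#   python local/process_textgrid.py --path data/train
# This writes segments_all, utt2spk_all, text_all and utt2dur_all under --path.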
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This program is a bit like ./sym2int.pl in that it applies a map
# to things in a file, but it's a bit more general in that it doesn't
# assume the things being mapped to are single tokens, they could
# be sequences of tokens. See the usage message.
$permissive = 0;
for ($x = 0; $x <= 2; $x++) {
if (@ARGV > 0 && $ARGV[0] eq "-f") {
shift @ARGV;
$field_spec = shift @ARGV;
if ($field_spec =~ m/^\d+$/) {
$field_begin = $field_spec - 1; $field_end = $field_spec - 1;
}
if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
if ($1 ne "") {
$field_begin = $1 - 1; # Change to zero-based indexing.
}
if ($2 ne "") {
$field_end = $2 - 1; # Change to zero-based indexing.
}
}
if (!defined $field_begin && !defined $field_end) {
die "Bad argument to -f option: $field_spec";
}
}
if (@ARGV > 0 && $ARGV[0] eq '--permissive') {
shift @ARGV;
# Mapping is optional (missing key is printed to output)
$permissive = 1;
}
}
if(@ARGV != 1) {
print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n";
print STDERR <<'EOF';
Usage: apply_map.pl [options] map <input >output
options: [-f <field-range> ] [--permissive]
This applies a map to some specified fields of some input text:
For each line in the map file: the first field is the thing we
map from, and the remaining fields are the sequence we map it to.
The -f (field-range) option says which fields of the input file the map
should apply to.
If the --permissive option is supplied, fields which are not present
in the map will be left as they were.
Applies the map 'map' to all input text, where each line of the map
is interpreted as a map from the first field to the list of the other fields
Note: <field-range> can look like 4-5, or 4-, or 5-, or 1, it means the field
range in the input to apply the map to.
e.g.: echo A B | apply_map.pl a.txt
where a.txt is:
A a1 a2
B b
will produce:
a1 a2 b
EOF
exit(1);
}
($map_file) = @ARGV;
open(M, "<$map_file") || die "Error opening map file $map_file: $!";
while (<M>) {
@A = split(" ", $_);
@A >= 1 || die "apply_map.pl: empty line.";
$i = shift @A;
$o = join(" ", @A);
$map{$i} = $o;
}
while(<STDIN>) {
@A = split(" ", $_);
for ($x = 0; $x < @A; $x++) {
if ( (!defined $field_begin || $x >= $field_begin)
&& (!defined $field_end || $x <= $field_end)) {
$a = $A[$x];
if (!defined $map{$a}) {
if (!$permissive) {
die "apply_map.pl: undefined key $a in $map_file\n";
} else {
print STDERR "apply_map.pl: warning! missing key $a in $map_file\n";
}
} else {
$A[$x] = $map{$a};
}
}
}
print join(" ", @A) . "\n";
}