Commit 764b3a75 authored by Sugon_ldc

add new model
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
# 2022 burkliu(boji123@aliyun.com)
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3"
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
# The num of nodes or machines used for multi-machine training
# Default 1 for single machine/node
# NFS will be needed if you want to run multi-machine training
num_nodes=1
# The rank of each node or machine, ranging from 0 to num_nodes - 1.
# The first node/machine sets node_rank 0, the second one sets node_rank 1,
# the third one sets node_rank 2, and so on. Default 0
node_rank=0
# modify this to your AISHELL-2 data path
# Note: the evaluation data (dev & test) is available at AISHELL.
# Please download it from http://aishell-eval.oss-cn-beijing.aliyuncs.com/TEST%26DEV%20DATA.zip
train_set=/cfs/share/corpus/aishell-2/AISHELL-2/iOS/data
dev_set=/cfs/share/corpus/aishell-2/AISHELL-DEV-TEST-SET/iOS/dev
test_set=/cfs/share/corpus/aishell-2/AISHELL-DEV-TEST-SET/iOS/test
nj=16
dict=data/dict/lang_char.txt
train_set=train
train_config=conf/conformer_u2pp_rnnt.yaml
cmvn=true
dir=exp/`basename ${train_config%.*}`
checkpoint=
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=30
decode_modes="rnnt_beam_search"
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=-1
# only used in rescore mode for weighting different scores
rescore_ctc_weight=0.5
rescore_transducer_weight=0.5
rescore_attn_weight=0.5
# only used in beam search, either pure beam search mode OR beam search inside rescoring
search_ctc_weight=0.3
search_transducer_weight=0.7
. tools/parse_options.sh || exit 1;
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Data preparation
local/prepare_data.sh ${train_set} data/local/${train_set} data/${train_set} || exit 1;
local/prepare_data.sh ${dev_set} data/local/dev data/dev || exit 1;
local/prepare_data.sh ${test_set} data/local/test data/test || exit 1;
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# remove the space between the text labels for Mandarin dataset
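# Illustrative example (hypothetical utterance id and text): the pipeline below
# turns "ID0001W0001 da vinci 密 码" into "ID0001W0001 DA▁VINCI密码", i.e. it
# upper-cases English, joins adjacent English words with ▁, and strips the
# remaining spaces between Mandarin characters.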
for x in ${train_set} dev test; do
cp data/${x}/text data/${x}/text.org
paste -d " " <(cut -f 1 data/${x}/text.org) <(cut -f 2- data/${x}/text.org \
| tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") \
> data/${x}/text
rm data/${x}/text.org
done
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp data/${train_set}/wav.scp \
--out_cmvn data/$train_set/global_cmvn
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Make train dict
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
tools/text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
| sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# Prepare wenet required data
echo "Prepare data, prepare required format"
for x in dev test ${train_set}; do
tools/make_raw_list.py data/$x/wav.scp data/$x/text data/$x/data.list
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
# You should rm it manually before you start run.sh on the first node.
# rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
# The number of GPUs running on each node/machine
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
#dist_backend="nccl"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
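# Illustrative example: with num_nodes=2 and 4 GPUs per node, world_size is 8;
# node 0 launches ranks 0-3 and node 1 launches ranks 4-7
# (rank = node_rank * num_gpus + i, as computed below).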
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp data/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with the model input
# and output dimensions; train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process, used for knowing whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type raw \
--symbol_table $dict \
--train_data data/$train_set/data.list \
--cv_data data/dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 4 \
$cmvn_opts \
2>&1 | tee -a $dir/train.log || exit 1;
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test the model. Please specify the model you want to test via --checkpoint
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best \
2>&1 | tee -a $dir/average.log || exit 1;
fi
for mode in ${decode_modes}; do
{
test_dir=$dir/test_${mode}_chunk_${decoding_chunk_size}
mkdir -p $test_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type raw \
--test_data data/test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $rescore_ctc_weight \
--transducer_weight $rescore_transducer_weight \
--attn_weight $rescore_attn_weight \
--search_ctc_weight $search_ctc_weight \
--search_transducer_weight $search_transducer_weight \
--result_file $test_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
python tools/compute-wer.py --char=1 --v=1 \
data/test/text $test_dir/text > $test_dir/wer
} &
done
wait
fi
../../../tools
../../../wenet
# Performance Record
## U2++ Conformer Result
* Feature info: using fbank feature, with cmvn, no speed perturb, with dither
* Training info: lr 0.001, batch size 32, 8 gpus, acc_grad 1, 240 epochs, dither 1.0
* Decoding info: ctc_weight 0.1, reverse_weight 0.4, average_num 30
* Git hash: 5a1342312668e7a5abb83aed1e53256819cebf95
| decoding mode/chunk size | full | 16 |
|---------------------------|-------|-------|
| ctc greedy search | 6.18 | 6.79 |
| ctc prefix beam search | 6.20 | 6.80 |
| attention rescoring | 5.39 | 5.78 |
| LM + attention rescoring | 5.35 | 5.73 |
## U2++ Transformer Result
* Feature info: using fbank feature, with cmvn, no speed perturb
* Training info: lr 0.002, batch size 22, 8 gpus, acc_grad 1, 240 epochs, dither 0.0
* Decoding info: ctc_weight 0.1, reverse_weight 0.5, average_num 30
* Git hash: 5a1342312668e7a5abb83aed1e53256819cebf95
| decoding mode/chunk size | full | 16 |
|---------------------------|-------|-------|
| ctc greedy search | 7.35 | 8.23 |
| ctc prefix beam search | 7.36 | 8.23 |
| attention rescoring | 6.09 | 6.70 |
| LM + attention rescoring | 6.07 | 6.55 |
## Unified Conformer Result
* Feature info: using fbank feature, with cmvn, no speed perturb.
* Training info: lr 0.002, batch size 16, 8 gpus, acc_grad 1, 120 epochs, dither 1.0
* Decoding info: ctc_weight 0.5, average_num 20
* Git hash: 14d38085a8d966cf9e9577ffafc51d578dce954f
| decoding mode/chunk size | full | 16 | 8 | 4 |
|---------------------------|-------|-------|-------|-------|
| attention decoder | 6.23 | 6.42 | 6.58 | 7.20 |
| ctc greedy search | 6.98 | 7.75 | 8.21 | 9.91 |
| ctc prefix beam search | 7.02 | 7.76 | 8.21 | 9.93 |
| attention rescoring | 6.08 | 6.46 | 6.72 | 7.79 |
| LM + attention rescoring | 5.87 | 6.37 | 6.47 | 6.61 |
## Unified Transformer Result
* Feature info: using fbank feature, with cmvn, no speed perturb.
* Training info: lr 0.002, batch size 22, 8 gpus, acc_grad 1, 180 epochs, dither 0.0
* Decoding info: ctc_weight 0.5, average_num 30
* Git hash: 14d38085a8d966cf9e9577ffafc51d578dce954f
| decoding mode/chunk size | full | 16 | 8 | 4 |
|---------------------------|-------|-------|-------|-------|
| attention decoder | 6.71 | 7.08 | 7.17 | 7.40 |
| ctc greedy search | 7.84 | 8.68 | 8.98 | 9.46 |
| ctc prefix beam search | 7.86 | 8.68 | 8.98 | 9.45 |
| attention rescoring | 6.71 | 7.31 | 7.51 | 7.85 |
| LM + attention rescoring | 6.35 | 7.02 | 7.24 | 7.52 |
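
As a rough reproduction guide (not part of the original recipe), the attention rescoring rows above can be obtained with a recognize.py call like the one in the recipe's test stage; the experiment directory below is a placeholder, and the weights follow the decoding info listed above for the U2++ Conformer.

```bash
# Illustrative only; mirrors the recognize.py invocation used in the test stage of run.sh.
python wenet/bin/recognize.py --gpu 0 \
  --mode attention_rescoring \
  --config exp/u2pp_conformer/train.yaml \
  --checkpoint exp/u2pp_conformer/avg_30.pt \
  --data_type raw \
  --test_data data/test/data.list \
  --dict data/dict/lang_char.txt \
  --beam_size 10 --batch_size 1 --penalty 0.0 \
  --ctc_weight 0.1 --reverse_weight 0.4 \
  --result_file exp/u2pp_conformer/test_attention_rescoring/text
```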
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 8
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
# dataset related
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
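# Note (assumed, matching the WarmupLR scheduler commonly used in WeNet/ESPnet):
# the learning rate at step t is
#   lr * warmup_steps^0.5 * min(t^-0.5, t * warmup_steps^-1.5),
# i.e. it rises linearly over the first 25000 steps and then decays as t^-0.5.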
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder architecture type
normalize_before: true
use_dynamic_chunk: true
use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
# feature extraction
collate_conf:
# waveform level config
wav_distortion_conf:
wav_dither: 1.0
wav_distortion_rate: 0.0
distortion_methods: []
speed_perturb: false
feature_extraction_conf:
feature_type: 'fbank'
mel_bins: 80
frame_shift: 10
# dataset related
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# dataset related
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 120
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder architecture type
normalize_before: true
use_dynamic_chunk: true
use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# feature extraction
collate_conf:
# waveform level config
wav_distortion_conf:
wav_dither: 0.0
wav_distortion_rate: 0.0
distortion_methods: []
speed_perturb: false
feature_extraction_conf:
feature_type: 'fbank'
mel_bins: 80
frame_shift: 10
# dataset related
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 130
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
#!/usr/bin/env bash
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0
# transform raw AISHELL-2 data to kaldi format
if [ $# != 3 ]; then
echo "prepare_data.sh <corpus-data-dir> <tmp-dir> <output-dir>"
echo " e.g prepare_data.sh /data/AISHELL-2/iOS/train data/local/train data/train"
exit 1;
fi
corpus=$1
tmp=$2
dir=$3
echo "prepare_data.sh: Preparing data in $corpus"
mkdir -p $tmp
mkdir -p $dir
# corpus check
if [ ! -d $corpus ] || [ ! -f $corpus/wav.scp ] || [ ! -f $corpus/trans.txt ]; then
echo "Error: $0 requires wav.scp and trans.txt under $corpus directory."
exit 1;
fi
# validate utt-key list
awk '{print $1}' $corpus/wav.scp > $tmp/wav_utt.list
awk '{print $1}' $corpus/trans.txt > $tmp/trans_utt.list
tools/filter_scp.pl -f 1 $tmp/wav_utt.list $tmp/trans_utt.list > $tmp/utt.list
# wav.scp
awk -F'\t' -v path_prefix=$corpus '{printf("%s\t%s/%s\n",$1,path_prefix,$2)}' $corpus/wav.scp > $tmp/tmp_wav.scp
tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tmp/wav.scp
# text
tools/filter_scp.pl -f 1 $tmp/utt.list $corpus/trans.txt | sort -k 1 | uniq > $tmp/trans.txt
dos2unix < $tmp/trans.txt | \
tools/filter_scp.pl -f 1 $tmp/utt.list - | \
sort -k 1 | uniq | tr '[a-z]' '[A-Z]' | \
sed 's/Ａ/A/g' | sed 's/Ｔ/T/g' | sed 's/Ｍ/M/g' | sed 's/𫚉//g' | sed 's/𫖯/頫/g' | \
sed 's/[()]//g' | sed "s/\([^A-Z]\)'/\1/g" > $tmp/text
# copy prepared resources from tmp_dir to target dir
mkdir -p $dir
for f in wav.scp text; do
cp $tmp/$f $dir/$f || exit 1;
done
echo "local/prepare_data.sh succeeded"
exit 0;
#!/bin/bash
# To be run from one directory above this script.
. ./path.sh
text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt
. tools/parse_options.sh
for f in "$text" "$lexicon"; do
[ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done
# Check SRILM tools
if ! which ngram-count > /dev/null; then
echo "srilm tools are not found, please download it and install it from: "
echo "http://www.speech.sri.com/projects/srilm/download.html"
echo "Then add the tools to your PATH"
exit 1
fi
dir=data/local/lm
mkdir -p $dir
cleantext=$dir/text.no_oov
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1;
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
heldout_sent=10000 # Don't change this if you want results to be comparable with
# kaldi_lm results
mkdir -p $dir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
head -$heldout_sent > $dir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
tail -n +$heldout_sent > $dir/train
ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
ngram -lm $dir/lm.arpa -ppl $dir/heldout
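# The resulting 3-gram LM ($dir/lm.arpa) is later consumed by
# tools/fst/make_tlg.sh (run.sh stage 7.4) to build the decoding TLG graph;
# the ngram call above just reports its perplexity on the held-out set.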
#!/usr/bin/env python
# encoding=utf-8
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0
from __future__ import print_function
import sys
import jieba
if len(sys.argv) < 3:
sys.stderr.write(
"word_segmentation.py <vocab> <trans> <word-segmented-trans>\n")
exit(1)
vocab_file = sys.argv[1]
trans_file = sys.argv[2]
jieba.set_dictionary(vocab_file)
for line in open(trans_file, 'r', encoding='utf8'):
key, trans = line.strip().split(' ', 1)
words = jieba.cut(trans,
HMM=False) # turn off new word discovery (HMM-based)
new_line = key + '\t' + " ".join(words)
print(new_line)
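# Example usage (matching run.sh stage 7.2):
#   python local/word_segmentation.py data/local/lm/word_seg_vocab.txt \
#       data/train/text > data/local/lm/text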
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_BIN=${BUILD_DIR}/../fc_base/openfst-build/src
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_BIN}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO
stage=0 # start from 0 if you need to start from data preparation
stop_stage=6
# The num of nodes or machines used for multi-machine training
# Default 1 for single machine/node
# NFS will be needed if you want to run multi-machine training
num_nodes=1
# The rank of each node or machine, ranging from 0 to num_nodes - 1.
# The first node/machine sets node_rank 0, the second one sets node_rank 1,
# the third one sets node_rank 2, and so on. Default 0
node_rank=0
# modify this to your AISHELL-2 data path
# Note: the evaluation data (dev & test) is available at AISHELL.
# Please download it from http://aishell-eval.oss-cn-beijing.aliyuncs.com/TEST%26DEV%20DATA.zip
trn_set=/mnt/nfs/ptm1/open-data/AISHELL-2/iOS/data
dev_set=/mnt/nfs/ptm1/open-data/AISHELL-DEV-TEST-SET/iOS/dev
tst_set=/mnt/nfs/ptm1/open-data/AISHELL-DEV-TEST-SET/iOS/test
nj=16
dict=data/dict/lang_char.txt
train_set=train
# Optional train_config
# 1. conf/train_transformer.yaml: Standard transformer
# 2. conf/train_conformer.yaml: Standard conformer
# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer
# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer
train_config=conf/train_unified_transformer.yaml
cmvn=true
dir=exp/transformer
checkpoint=
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=30
decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring"
. tools/parse_options.sh || exit 1;
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Data preparation
local/prepare_data.sh ${trn_set} data/local/${train_set} data/${train_set} || exit 1;
local/prepare_data.sh ${dev_set} data/local/dev data/dev || exit 1;
local/prepare_data.sh ${tst_set} data/local/test data/test || exit 1;
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# remove the space between the text labels for Mandarin dataset
for x in ${train_set} dev test; do
cp data/${x}/text data/${x}/text.org
paste -d " " <(cut -f 1 data/${x}/text.org) <(cut -f 2- data/${x}/text.org \
| tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") \
> data/${x}/text
rm data/${x}/text.org
done
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp data/${train_set}/wav.scp \
--out_cmvn data/$train_set/global_cmvn
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Make train dict
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
tools/text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
| sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# Prepare wenet required data
echo "Prepare data, prepare required format"
for x in dev test ${train_set}; do
tools/make_raw_list.py data/$x/wav.scp data/$x/text data/$x/data.list
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
# You should rm it manually before you start run.sh on the first node.
# rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
# The number of GPUs running on each node/machine
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp data/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with the model input
# and output dimensions; train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process, used for knowing whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type raw \
--symbol_table $dict \
--train_data data/$train_set/data.list \
--cv_data data/dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 2 \
$cmvn_opts
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test the model. Please specify the model you want to test via --checkpoint
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
ctc_weight=0.5
for mode in ${decode_modes}; do
{
test_dir=$dir/test_${mode}
mkdir -p $test_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type raw \
--test_data data/test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--result_file $test_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
python tools/compute-wer.py --char=1 --v=1 \
data/test/text $test_dir/text > $test_dir/wer
} &
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip \
--output_quant_file $dir/final_quant.zip
fi
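# The exported final.zip / final_quant.zip are TorchScript models; as an
# illustration, they can be loaded in Python with torch.jit.load("final.zip")
# or passed to the libtorch runtime as done by the decode.sh call in stage 7.5.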
# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# 7.1 Prepare dict
unit_file=$dict
download_dir=data/local/DaCiDian
git clone https://github.com/aishell-foundation/DaCiDian.git $download_dir
mkdir -p data/local/dict
cp $unit_file data/local/dict/units.txt
tools/fst/prepare_dict.py $unit_file $download_dir/word_to_pinyin.txt \
data/local/dict/lexicon.txt
# 7.2 Segment text
pip install jieba
lm=data/local/lm
mkdir -p $lm
awk '{print $1}' data/local/dict/lexicon.txt | \
awk '{print $1,99}' > $lm/word_seg_vocab.txt
python local/word_segmentation.py $lm/word_seg_vocab.txt \
data/train/text > $lm/text
# 7.3 Train lm
local/train_lms.sh
# 7.4 Build decoding TLG
tools/fst/compile_lexicon_token_fst.sh \
data/local/dict data/local/tmp data/local/lang
tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
# 7.5 Decoding with runtime
# reverse_weight only works for the u2++ model; when it is set to 0.0,
# only the left-to-right decoder is used.
reverse_weight=0.0
chunk_size=-1
./tools/decode.sh --nj 16 --chunk_size $chunk_size \
--beam 15.0 --lattice_beam 7.5 --max_active 7000 --blank_skip_thresh 0.98 \
--ctc_weight 0.3 --rescoring_weight 1.0 --reverse_weight $reverse_weight \
--fst_path data/lang_test/TLG.fst \
--dict_path data/lang_test/words.txt \
data/test/wav.scp data/test/text $dir/final.zip data/lang_test/units.txt \
$dir/lm_with_runtime
# See $dir/lm_with_runtime for wer
tail $dir/lm_with_runtime/wer
fi
../../../tools/
../../../wenet/
# Performance Record
## Conformer Result
* Feature info: using fbank feature, cmvn, without speed perturb (segments are not supported yet)
* Training info: lr 0.001, max_frames_in_batch 15000, 8 gpus, acc_grad 4, 100 epochs
* Decoding info: ctc_weight 0.5, average_num 30
| decoding mode | Test WER |
|---------------------|----------|
| attention rescoring | 32.58% |
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
filter_conf:
max_length: 4096
min_length: 10
token_max_length: 200
token_min_length: 1
#resample_conf:
# resample_rate: 16000
speed_perturb: false
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'dynamic' # static or dynamic
max_frames_in_batch: 15000
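# Note on 'dynamic' batching (assumed behaviour, as implemented in WeNet's
# dataset processor): utterances are accumulated into a batch until the padded
# size (longest utterance frames x number of utterances) would exceed
# max_frames_in_batch, so the batch size varies with utterance length.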
grad_clip: 5
accum_grad: 4
max_epoch: 100
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 1000
# -*- coding: utf-8 -*-
"""
Process the textgrid files
"""
import argparse
import codecs
from pathlib import Path
import textgrid
class Segment(object):
def __init__(self, uttid, spkr, stime, etime, text):
self.uttid = uttid
self.spkr = spkr
self.stime = round(stime, 2)
self.etime = round(etime, 2)
self.text = text
def get_args():
parser = argparse.ArgumentParser(description="process the textgrid files")
parser.add_argument("--path", type=str, required=True, help="Data path")
args = parser.parse_args()
return args
def main(args):
wav_scp = codecs.open(Path(args.path) / "wav.scp", "r", "utf-8")
textgrid_flist = codecs.open(
Path(args.path) / "textgrid.flist", "r", "utf-8")
# get the path of textgrid file for each utterance
utt2textgrid = {}
for line in textgrid_flist:
path = Path(line.strip())
# the name of textgrid file is different between training and test set
if "train" in path.parts:
uttid = "%s_%s" % (path.parts[-2], path.stem)
else:
uttid = path.stem
utt2textgrid[uttid] = path
# parse the textgrid file for each utterance
all_segments = []
for line in wav_scp:
uttid = line.strip().split(" ")[0]
if uttid not in utt2textgrid:
print("%s doesn't have transcription" % uttid)
continue
segments = []
tg = textgrid.TextGrid.fromFile(utt2textgrid[uttid])
for i in range(tg.__len__()):
for j in range(tg[i].__len__()):
if tg[i][j].mark.strip():
segments.append(
Segment(
uttid,
tg[i].name,
tg[i][j].minTime,
tg[i][j].maxTime,
tg[i][j].mark.strip(),
))
segments = sorted(segments, key=lambda x: x.stime)
all_segments += segments
wav_scp.close()
textgrid_flist.close()
segments_file = codecs.open(Path(args.path) / "segments_all", "w", "utf-8")
utt2spk_file = codecs.open(Path(args.path) / "utt2spk_all", "w", "utf-8")
text_file = codecs.open(Path(args.path) / "text_all", "w", "utf-8")
utt2dur_file = codecs.open(Path(args.path) / "utt2dur_all", "w", "utf-8")
for i in range(len(all_segments)):
utt_name = "%s-%s-%07d-%07d" % (
all_segments[i].uttid,
all_segments[i].spkr,
all_segments[i].stime * 100,
all_segments[i].etime * 100,
)
segments_file.write("%s %s %.2f %.2f\n" % (
utt_name,
all_segments[i].uttid,
all_segments[i].stime,
all_segments[i].etime,
))
utt2spk_file.write(
"%s %s-%s\n" %
(utt_name, all_segments[i].uttid, all_segments[i].spkr))
text_file.write("%s %s\n" % (utt_name, all_segments[i].text))
utt2dur_file.write(
"%s %.2f\n" %
(utt_name, all_segments[i].etime - all_segments[i].stime))
if len(all_segments[i].text) / (all_segments[i].etime -
all_segments[i].stime) > 100:
print(utt_name)
print(
len(all_segments[i].text) /
(all_segments[i].etime - all_segments[i].stime))
segments_file.close()
utt2spk_file.close()
text_file.close()
utt2dur_file.close()
if __name__ == "__main__":
args = get_args()
main(args)
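# Example usage (the script path is an assumption; adjust to where this file
# lives in the recipe):
#   python local/process_textgrid.py --path data/train
# This writes segments_all, utt2spk_all, text_all and utt2dur_all under --path.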
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This program is a bit like ./sym2int.pl in that it applies a map
# to things in a file, but it's a bit more general in that it doesn't
# assume the things being mapped to are single tokens, they could
# be sequences of tokens. See the usage message.
$permissive = 0;
for ($x = 0; $x <= 2; $x++) {
if (@ARGV > 0 && $ARGV[0] eq "-f") {
shift @ARGV;
$field_spec = shift @ARGV;
if ($field_spec =~ m/^\d+$/) {
$field_begin = $field_spec - 1; $field_end = $field_spec - 1;
}
if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
if ($1 ne "") {
$field_begin = $1 - 1; # Change to zero-based indexing.
}
if ($2 ne "") {
$field_end = $2 - 1; # Change to zero-based indexing.
}
}
if (!defined $field_begin && !defined $field_end) {
die "Bad argument to -f option: $field_spec";
}
}
if (@ARGV > 0 && $ARGV[0] eq '--permissive') {
shift @ARGV;
# Mapping is optional (missing key is printed to output)
$permissive = 1;
}
}
if(@ARGV != 1) {
print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n";
print STDERR <<'EOF';
Usage: apply_map.pl [options] map <input >output
options: [-f <field-range> ] [--permissive]
This applies a map to some specified fields of some input text:
For each line in the map file: the first field is the thing we
map from, and the remaining fields are the sequence we map it to.
The -f (field-range) option says which fields of the input file the map
should apply to.
If the --permissive option is supplied, fields which are not present
in the map will be left as they were.
Applies the map 'map' to all input text, where each line of the map
is interpreted as a map from the first field to the list of the other fields
Note: <field-range> can look like 4-5, or 4-, or 5-, or 1, it means the field
range in the input to apply the map to.
e.g.: echo A B | apply_map.pl a.txt
where a.txt is:
A a1 a2
B b
will produce:
a1 a2 b
EOF
exit(1);
}
($map_file) = @ARGV;
open(M, "<$map_file") || die "Error opening map file $map_file: $!";
while (<M>) {
@A = split(" ", $_);
@A >= 1 || die "apply_map.pl: empty line.";
$i = shift @A;
$o = join(" ", @A);
$map{$i} = $o;
}
while(<STDIN>) {
@A = split(" ", $_);
for ($x = 0; $x < @A; $x++) {
if ( (!defined $field_begin || $x >= $field_begin)
&& (!defined $field_end || $x <= $field_end)) {
$a = $A[$x];
if (!defined $map{$a}) {
if (!$permissive) {
die "apply_map.pl: undefined key $a in $map_file\n";
} else {
print STDERR "apply_map.pl: warning! missing key $a in $map_file\n";
}
} else {
$A[$x] = $map{$a};
}
}
}
print join(" ", @A) . "\n";
}