Commit 764b3a75 authored by Sugon_ldc

add new model
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d6 # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 31
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

dataset_conf:
    filter_conf:
        max_length: 2000
        min_length: 100
        token_max_length: 160
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: false
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 1.0
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 3
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 28

grad_clip: 5
accum_grad: 4
max_epoch: 30
log_interval: 100

optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 100000
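
# Note: with accum_grad 4, gradients are accumulated over 4 mini-batches before
# each optimizer step, so the effective batch size is roughly
# batch_size * accum_grad per GPU (28 * 4 here), and larger again when several
# GPUs are used with DDP.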
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 31
    use_cnn_module: True
    cnn_module_norm: 'layer_norm'
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: bitransformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 3
    r_num_blocks: 3
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    reverse_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

dataset_conf:
    filter_conf:
        max_length: 2000
        min_length: 100
        token_max_length: 160
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: false
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 1.0
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 3
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 20

grad_clip: 5
accum_grad: 4
max_epoch: 50
log_interval: 100

optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 100000
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d6 # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 8
    use_cnn_module: True
    causal: true
    use_dynamic_chunk: true
    cnn_module_norm: 'layer_norm'
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: bitransformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 3
    r_num_blocks: 3
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    reverse_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

dataset_conf:
    filter_conf:
        max_length: 2000
        min_length: 100
        token_max_length: 160
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: false
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 1.0
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 3
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 28

grad_clip: 5
accum_grad: 1
max_epoch: 50
log_interval: 100

optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 80000
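
# Note: causal: true together with use_dynamic_chunk: true trains a unified
# streaming/non-streaming model; the chunk size is then chosen at decode time
# via --decoding_chunk_size in wenet/bin/recognize.py (leave it empty or use -1
# for full-context, non-streaming decoding), as done in run.sh below.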
#!/usr/bin/env python
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
#                Mobvoi Corporation (Author: Di Wu)

import sys
import os
import argparse
import json


def get_args():
    parser = argparse.ArgumentParser(description="""
        This script is used to process the raw json dataset of GigaSpeech,
        where the long wav is split into segments and
        data of wenet format is generated.
        """)
    parser.add_argument('input_json', help="""Input json file of GigaSpeech""")
    parser.add_argument('output_dir', help="""Output dir for prepared data""")
    args = parser.parse_args()
    return args


def meta_analysis(input_json, output_dir):
    input_dir = os.path.dirname(input_json)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    try:
        with open(input_json, 'r') as injson:
            json_data = json.load(injson)
    except Exception:
        sys.exit(f'Failed to load input json file: {input_json}')
    else:
        if json_data['audios'] is not None:
            with open(f'{output_dir}/text', 'w') as utt2text, \
                    open(f'{output_dir}/segments', 'w') as segments, \
                    open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
                    open(f'{output_dir}/wav.scp', 'w') as wavscp, \
                    open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
                    open(f'{output_dir}/reco2dur', 'w') as reco2dur:
                for long_audio in json_data['audios']:
                    try:
                        long_audio_path = os.path.realpath(
                            os.path.join(input_dir, long_audio['path']))
                        aid = long_audio['aid']
                        segments_lists = long_audio['segments']
                        duration = long_audio['duration']
                        assert (os.path.exists(long_audio_path))
                        assert ('opus' == long_audio['format'])
                        assert (16000 == long_audio['sample_rate'])
                    except AssertionError:
                        print(f'Warning: {aid} something is wrong, maybe '
                              'AssertionError, skipped')
                        continue
                    except Warning:
                        print(f'Warning: {aid} something is wrong, maybe the '
                              f'error path: {long_audio_path}, skipped')
                        continue
                    else:
                        wavscp.write(f'{aid}\t{long_audio_path}\n')
                        reco2dur.write(f'{aid}\t{duration}\n')
                        for segment_file in segments_lists:
                            try:
                                sid = segment_file['sid']
                                start_time = segment_file['begin_time']
                                end_time = segment_file['end_time']
                                dur = end_time - start_time
                                text = segment_file['text_tn']
                                segment_subsets = segment_file["subsets"]
                            except Warning:
                                print(f'Warning: {segment_file} something is '
                                      'wrong, skipped')
                                continue
                            else:
                                utt2text.write(f'{sid}\t{text}\n')
                                segments.write(
                                    f'{sid}\t{aid}\t{start_time}\t{end_time}\n')
                                utt2dur.write(f'{sid}\t{dur}\n')
                                segment_sub_names = " ".join(segment_subsets)
                                utt2subsets.write(
                                    f'{sid}\t{segment_sub_names}\n')


def main():
    args = get_args()
    meta_analysis(args.input_json, args.output_dir)


if __name__ == '__main__':
    main()
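
# Example invocation (paths are illustrative; the data prep script below calls
# this as):
#   python3 local/extract_meta.py $gigaspeech_dir/GigaSpeech.json $corpus_dir
# which writes wav.scp, text, segments, utt2dur, utt2subsets and reco2dur
# into the corpus directory.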
#!/usr/bin/env bash
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
# Seasalt AI, Inc (Author: Guoguo Chen)
# Mobvoi Corporation (Author: Di Wu)
set -e
set -o pipefail
stage=1
prefix=
garbage_utterance_tags="<SIL> <MUSIC> <NOISE> <OTHER>"
punctuation_tags="<COMMA> <EXCLAMATIONPOINT> <PERIOD> <QUESTIONMARK>"
train_subset=XL
. ./tools/parse_options.sh || exit 1;
filter_by_id () {
  idlist=$1
  input=$2
  output=$3
  field=1
  if [ $# -eq 4 ]; then
    field=$4
  fi
  cat $input | perl -se '
    open(F, "<$idlist") || die "Could not open id-list file $idlist";
    while(<F>) {
      @A = split;
      @A>=1 || die "Invalid id-list file line $_";
      $seen{$A[0]} = 1;
    }
    while(<>) {
      @A = split;
      @A > 0 || die "Invalid file line $_";
      @A >= $field || die "Invalid file line $_";
      if ($seen{$A[$field-1]}) {
        print $_;
      }
    }' -- -idlist="$idlist" -field="$field" > $output ||\
  (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
}
subset_data_dir () {
  utt_list=$1
  src_dir=$2
  dest_dir=$3
  mkdir -p $dest_dir || exit 1;
  # wav.scp text segments utt2dur
  filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\
    (echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1;
  filter_by_id $utt_list $src_dir/text $dest_dir/text ||\
    (echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1;
  filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\
    (echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1;
  awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco
  filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\
    (echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1;
  rm -f $dest_dir/reco
}
if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <gigaspeech-dataset-dir> <data-dir>"
  echo " e.g.: $0 --train-subset XL /disk1/audio_data/gigaspeech/ data/"
  echo ""
  echo "This script takes the GigaSpeech source directory, and prepares the"
  echo "WeNet format data directory."
  echo "  --garbage-utterance-tags <tags>   # Tags for non-speech."
  echo "  --prefix <prefix>                 # Prefix for output data directory."
  echo "  --punctuation-tags <tags>         # Tags for punctuations."
  echo "  --stage <stage>                   # Processing stage."
  echo "  --train-subset <XL|L|M|S|XS>      # Train subset to be created."
  exit 1
fi
gigaspeech_dir=$1
data_dir=$2
declare -A subsets
subsets=(
  [XL]="train_xl"
  [L]="train_l"
  [M]="train_m"
  [S]="train_s"
  [XS]="train_xs"
  [DEV]="dev"
  [TEST]="test")
corpus_dir=$data_dir/${prefix}corpus/
if [ $stage -le 1 ]; then
  echo "$0: Extract meta into $corpus_dir"
  # Sanity check.
  [ ! -f $gigaspeech_dir/GigaSpeech.json ] &&\
    echo "$0: Please download $gigaspeech_dir/GigaSpeech.json!" && exit 1;
  [ ! -d $gigaspeech_dir/audio ] &&\
    echo "$0: Please download $gigaspeech_dir/audio!" && exit 1;
  [ ! -d $corpus_dir ] && mkdir -p $corpus_dir
  # Files to be created:
  # wav.scp text segments utt2dur
  python3 local/extract_meta.py \
    $gigaspeech_dir/GigaSpeech.json $corpus_dir || exit 1;
fi
if [ $stage -le 2 ]; then
  echo "$0: Filter $corpus_dir/text"
  # Delete utterances with garbage meta tags
  for tag in $garbage_utterance_tags; do
    sed -i "/${tag}/d" $corpus_dir/text
  done
  # Delete punctuation tags in utterances
  for tag in $punctuation_tags; do
    sed -i "s/${tag}//g" $corpus_dir/text
  done
  # Ensure spaces only appear once and the utt id is separated from the
  # transcript by '\t'
  sed -i 's/\t/ /g' $corpus_dir/text
  sed -i 's/[ ][ ]*/ /g' $corpus_dir/text
  sed -i 's/ /\t/' $corpus_dir/text
fi
if [ $stage -le 3 ]; then
  echo "$0: Split data to train, dev and test"
  # Split data to train, dev and test.
  [ ! -f $corpus_dir/utt2subsets ] &&\
    echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;
  for label in $train_subset DEV TEST; do
    if [ ! ${subsets[$label]+set} ]; then
      echo "$0: Subset $label is not defined in GigaSpeech.json." && exit 1;
    fi
    subset=${subsets[$label]}
    [ ! -d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset
    grep "{$label}" $corpus_dir/utt2subsets \
      > $corpus_dir/${prefix}${subset}_utt_list || exit 1;
    subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \
      $corpus_dir $data_dir/${prefix}$subset || exit 1;
  done
fi
echo "$0: Done"
#!/usr/bin/env python3
import os
import argparse

conversational_filler = [
    'UH', 'UHH', 'UM', 'EH', 'MM', 'HM', 'AH', 'HUH', 'HA', 'ER', 'OOF', 'HEE',
    'ACH', 'EEE', 'EW'
]
unk_tags = ['<UNK>', '<unk>']
gigaspeech_punctuations = [
    '<COMMA>', '<PERIOD>', '<QUESTIONMARK>', '<EXCLAMATIONPOINT>'
]
gigaspeech_garbage_utterance_tags = ['<SIL>', '<NOISE>', '<MUSIC>', '<OTHER>']
non_scoring_words = conversational_filler + unk_tags + \
    gigaspeech_punctuations + gigaspeech_garbage_utterance_tags


def asr_text_post_processing(text):
    # 1. convert to uppercase
    text = text.upper()

    # 2. remove hyphen
    #    "E-COMMERCE" -> "E COMMERCE", "STATE-OF-THE-ART" -> "STATE OF THE ART"
    text = text.replace('-', ' ')

    # 3. remove non-scoring words from evaluation
    remaining_words = []
    for word in text.split():
        if word in non_scoring_words:
            continue
        remaining_words.append(word)

    return ' '.join(remaining_words)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='''This script evaluates GigaSpeech ASR
        result via SCTK's tool sclite''')
    parser.add_argument(
        'ref',
        type=str,
        help="sclite's standard transcription(trn) reference file")
    parser.add_argument(
        'hyp',
        type=str,
        help="sclite's standard transcription(trn) hypothesis file")
    parser.add_argument('work_dir', type=str, help='working dir')
    args = parser.parse_args()

    if not os.path.isdir(args.work_dir):
        os.mkdir(args.work_dir)

    REF = os.path.join(args.work_dir, 'REF')
    HYP = os.path.join(args.work_dir, 'HYP')
    RESULT = os.path.join(args.work_dir, 'RESULT')

    for io in [(args.ref, REF), (args.hyp, HYP)]:
        with open(io[0], 'r', encoding='utf8') as fi, \
                open(io[1], 'w+', encoding='utf8') as fo:
            for line in fi:
                line = line.strip()
                if line:
                    cols = line.split()
                    text = asr_text_post_processing(' '.join(cols[0:-1]))
                    uttid_field = cols[-1]
                    print(f'{text} {uttid_field}', file=fo)

    # GigaSpeech's uttid conforms to swb
    os.system(f'sclite -r {REF} trn -h {HYP} trn -i swb | tee {RESULT}')
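
# A quick illustration of the post-processing above (values are examples):
#   asr_text_post_processing("state-of-the-art <COMMA> um yes")
#   -> "STATE OF THE ART YES"
# run.sh invokes this file as:
#   local/gigaspeech_scoring.py $data/$test/ref $test_dir/hyp $test_dir
# and the sclite alignment report ends up in the work dir's RESULT file.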
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2021 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU; otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
# The num of nodes or machines used for multi-machine training
# Default 1 for single machine/node
# NFS will be needed if you want to run multi-machine training
num_nodes=1
# The rank of each node or machine, ranging from 0 to num_nodes - 1.
# The first node/machine sets node_rank 0, the second one sets node_rank 1,
# the third one sets node_rank 2, and so on. Default 0
node_rank=0
# data
# Use your own data path. You can contact gigaspeech@speechcolab.org for
# information about getting the GigaSpeech data.
# The preparation of the GigaSpeech dataset for WeNet can be found at
# https://github.com/SpeechColab/GigaSpeech
giga_data_dir=/export/expts6/corpus/data/en-asr-data/16k/GigaSpeech
shards_dir=/ssd/nfs06/unified_data/giga_shards
# gigaspeech training set
set=XL
train_set=train_`echo $set |tr 'A-Z' 'a-z'`
train_dev=dev
recog_set=test
# wav data dir
data=data
nj=16
# Optional train_config
# 1. conf/train_conformer.yaml: Standard Conformer
# 2. conf/train_conformer_bidecoder.yaml: Bidecoder Conformer
train_config=conf/train_conformer_bidecoder.yaml
checkpoint=
cmvn=false
do_delta=false
dir=exp/sp_spec_aug
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
# You may need to adjust it if you cannot get results close to those in README.md
average_num=3
decode_modes="attention_rescoring ctc_greedy_search"
. tools/parse_options.sh || exit 1;
# bpemode (unigram or bpe)
nbpe=5000
bpemode=unigram
set -e
set -u
set -o pipefail
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to do the following data preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: Data preparation"
local/gigaspeech_data_prep.sh --train-subset $set --stage 1 $giga_data_dir $data
sed -i "s/\t/ /g" $data/${train_set}/text
sed -i "s/\t/ /g" $data/${train_dev}/text
sed -i "s/\t/ /g" $data/${recog_set}/text
for x in $train_dev $train_set $recog_set; do
paste -d " " <(cut -f1 -d " " $data/$x/text) <(cut -f1 -d " " $data/$x/text) > $data/$x/spk2utt
cp $data/$x/spk2utt $data/$x/utt2spk
tools/fix_data_dir.sh $data/$x
done
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
### Task dependent. You have to design training and dev sets by yourself.
echo "stage 1: generate segmented wav.scp and compute cmvn"
# the format of wav.segment.scp is:
# POD1000000004_S0000000 /GigaSpeech/audio/podcast/P0000/POD1000000004.opus,0.0,10.197
# 0.0 is start time, 10.197 is end time (second)
for x in $train_dev $train_set $recog_set; do
python tools/segment.py --segments $data/$x/segments \
--input $data/$x/wav.scp \
--output $data/$x/wav.segment.scp
done
# optional
# compute cmvn, perhaps you can sample some segmented examples from wav.scp for cmvn computation
python tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp $data/$train_set/wav.segment.scp \
--out_cmvn $data/$train_set/global_cmvn
fi
dict=$data/lang_char_$set/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=$data/lang_char_$set/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p $data/lang_char_$set/
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
# We borrowed the BPE-related code and scripts from ESPnet.
cut -f 2- -d" " $data/${train_set}/text > $data/lang_char_$set/input.txt
tools/spm_train --input=$data/lang_char_$set/input.txt --vocab_size=${nbpe} \
--model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
tools/spm_encode --model=${bpemodel}.model --output_format=piece \
< $data/lang_char_$set/input.txt | \
tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
wc -l ${dict}
fi
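
# The resulting dictionary is a plain "token id" table; for illustration
# (the actual BPE pieces depend on the training text), it looks roughly like:
#   <blank> 0
#   <unk> 1
#   ▁THE 2
#   ...
#   <sos/eos> <num_token>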
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Making shards, please wait..."
RED='\033[0;31m'
NOCOLOR='\033[0m'
echo -e "It requires ${RED}1.2T ${NOCOLOR}space for $shards_dir, please make sure you have enough space"
echo -e "It takes about ${RED}12 ${NOCOLOR}hours with 32 threads"
for x in $train_dev $train_set $recog_set; do
dst=$shards_dir/$x
mkdir -p $dst
tools/make_shard_list.py --resample 16000 --num_utts_per_shard 1000 \
--num_threads 32 --segments data/$x/segments \
data/$x/wav.scp data/$x/text \
$(realpath $dst) data/$x/data.list
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="nccl"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp ${feat_dir}/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process used for knowing whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type "shard" \
--symbol_table $dict \
--bpe_model $bpemodel.model \
--train_data $data/$train_set/data.list \
--cv_data $data/$train_dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 16 \
$cmvn_opts
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test model, please specify the model you want to test by --checkpoint
cmvn_opts=
$cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
# TODO, Add model average here
mkdir -p $dir/test
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
ctc_weight=0.5
# Polling GPU id begin with index 0
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
idx=0
for test in $recog_set; do
for mode in ${decode_modes}; do
{
{
test_dir=$dir/${test}_${mode}
mkdir -p $test_dir
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type "shard" \
--symbol_table $dict \
--bpe_model $bpemodel.model \
--test_data $data/$test/format.data \
--checkpoint $decode_checkpoint \
--beam_size 20 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--result_file $test_dir/text_bpe \
--ctc_weight $ctc_weight \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp
cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp
tools/spm_decode --model=${bpemodel}.model --input_format=piece \
< $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value
paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value > $test_dir/text
# a raw version of WER without the refining process
python tools/compute-wer.py --char=1 --v=1 \
$data/$test/text $test_dir/text > $test_dir/wer
# for gigaspeech scoring
cat $test_dir/text_bpe_key_tmp | sed -e "s/^/(/g" | sed -e "s/$/)/g" > $test_dir/hyp_key
paste -d " " $test_dir/text_value $test_dir/hyp_key > $test_dir/hyp
paste -d " " <(cut -f2- -d " " $data/$test/text) \
<(cut -f1 -d " " $data/$test/text | \
sed -e "s/^/(/g" | sed -e "s/$/)/g") > $data/$test/ref
local/gigaspeech_scoring.py $data/$test/ref $test_dir/hyp $test_dir
} &
((idx+=1))
if [ $idx -eq $num_gpus ]; then
idx=0
fi
}
done
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip
fi
../../../tools
../../../wenet
# Performance Record
## Conformer Result (Old IO)
* Feature info: using fbank feature, with cmvn, with speed perturb.
* Training info: lr 0.002, batch size 16, 1 machine, 1 * 4 = 4 GPUs, acc_grad 4, 240 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 30
| decoding mode            | CER   |
|--------------------------|-------|
| attention decoder | 21.9 |
| ctc greedy search | 21.15 |
| ctc prefix beam search | 21.13 |
| attention rescoring | 20.47 |
## Conformer Result (New IO)
* Feature info: using fbank feature, with cmvn, with speed perturb.
* Training info: lr 0.002, batch size 16, 1 machine, 1 * 4 = 4 GPUs, acc_grad 4, 133 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 30
| decoding mode            | CER   |
|--------------------------|-------|
| attention decoder | 21.42 |
| ctc greedy search | 21.16 |
| ctc prefix beam search | 21.18 |
| attention rescoring | 20.42 |
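
The averaged checkpoint referenced above (average_num 30) is produced by `wenet/bin/average_model.py` inside `run.sh`; a sketch of the equivalent standalone command, assuming the recipe's default experiment directory `exp/conformer`, is:

```bash
# Average the 30 best checkpoints (by validation loss) into a single model.
python wenet/bin/average_model.py \
  --dst_model exp/conformer/avg_30.pt \
  --src_path exp/conformer \
  --num 30 \
  --val_best
```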
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# feature extraction
dataset_conf:
    filter_conf:
        max_length: 2000
        min_length: 50
        token_max_length: 400
        token_min_length: 1
        max_output_input_ratio: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 16

grad_clip: 5
accum_grad: 4
max_epoch: 240
log_interval: 100

optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
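
# Note on the schedule (a rough description of the warmuplr scheduler as
# implemented in WeNet/ESPnet): the learning rate ramps up roughly linearly for
# the first warmup_steps optimizer steps and then decays approximately with the
# inverse square root of the step count, so the peak lr of about 0.002 is
# reached around step 25000 here.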
#!/usr/bin/env bash
. ./path.sh || exit 1;
if [ $# != 2 ]; then
echo "Usage: $0 <audio-path> <text-path>"
echo " $0 /export/corpora/LDC03S04 /export/corpora/LDC03T19"
exit 1;
fi
hkust_audio_dir=$1
hkust_text_dir=$2
train_dir=data/local/train
dev_dir=data/local/dev
train_dev=train_dev
train_nodev=train_nodev
nj=16
mkdir -p $train_dir
mkdir -p $dev_dir
#data directory check
if [ ! -d $hkust_audio_dir ] || [ ! -d $hkust_text_dir ]; then
echo "Error: $0 requires two directory arguments"
exit 1;
fi
#find sph audio file for train dev resp.
find $hkust_audio_dir -iname "*.sph" | grep -i "audio/train" > $train_dir/sph.flist || exit 1;
find $hkust_audio_dir -iname "*.sph" | grep -i "audio/dev" > $dev_dir/sph.flist || exit 1;
n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l`
[ $n -ne 897 ] && \
echo Warning: expected 897 data files, found $n
#Transcriptions preparation
#collect all trans, convert encodings to utf-8,
find $hkust_text_dir -iname "*.txt" | grep -i "trans/train" | xargs cat |\
iconv -f GBK -t UTF-8 | perl -e '
while (<STDIN>) {
@A = split(" ", $_);
if (@A <= 1) { next; }
if ($A[0] eq "#") { $utt_id = $A[1]; }
if (@A >= 3) {
$A[2] =~ s:^([AB])\:$:$1:;
printf "%s-%s-%06.0f-%06.0f", $utt_id, $A[2], 100*$A[0] + 0.5, 100*$A[1] + 0.5;
for($n = 3; $n < @A; $n++) { print " $A[$n]" };
print "\n";
}
}
' | sort -k1 > $train_dir/transcripts.txt || exit 1;
find $hkust_text_dir -iname "*.txt" | grep -i "trans/dev" | xargs cat |\
iconv -f GBK -t UTF-8 | perl -e '
while (<STDIN>) {
@A = split(" ", $_);
if (@A <= 1) { next; }
if ($A[0] eq "#") { $utt_id = $A[1]; }
if (@A >= 3) {
$A[2] =~ s:^([AB])\:$:$1:;
printf "%s-%s-%06.0f-%06.0f", $utt_id, $A[2], 100*$A[0] + 0.5, 100*$A[1] + 0.5;
for($n = 3; $n < @A; $n++) { print " $A[$n]" };
print "\n";
}
}
' | sort -k1 > $dev_dir/transcripts.txt || exit 1;
#transcripts normalization and segmentation
cat $train_dir/transcripts.txt |\
sed -e 's/<foreign language=\"[a-zA-Z]\+\">/ /g' |\
sed -e 's/<\/foreign>/ /g' |\
sed -e 's/<noise>\(.\+\)<\/noise>/\1/g' |\
sed -e 's/<\/noise>//g' |\
sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
awk '{if (NF > 1) print $0;}' |\
local/hkust_normalize.pl |\
awk '{if (NF > 0) print $0;}' > $train_dir/text || exit 1;
cat $dev_dir/transcripts.txt |\
sed -e 's/<foreign language=\"[a-zA-Z]\+\">/ /g' |\
sed -e 's/<\/foreign>/ /g' |\
sed -e 's/<noise>\(.\+\)<\/noise>/\1/g' |\
sed -e 's/<\/noise>//g' |\
sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
awk '{if (NF > 1) print $0;}' |\
local/hkust_normalize.pl |\
awk '{if (NF > 0) print $0;}' > $dev_dir/text || exit 1;
# some data is corrupted. Delete them
cat $train_dir/text | grep -v 20040527_210939_A901153_B901154-A-035691-035691 | egrep -v "A:|B:" > tmp
mv tmp $train_dir/text || exit 1;
#Make segment files from transcript
#segments file format is: utt-id side-id start-time end-time, e.g.:
#sw02001-A_000098-001156 sw02001-A 0.98 11.56
awk '{ segment=$1; split(segment,S,"-"); side=S[2]; audioname=S[1];startf=S[3];endf=S[4];
print segment " " audioname "-" side " " startf/100 " " endf/100}' <$train_dir/text > $train_dir/segments
awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' $train_dir/sph.flist > $train_dir/sph.scp
awk '{ segment=$1; split(segment,S,"-"); side=S[2]; audioname=S[1];startf=S[3];endf=S[4];
print segment " " audioname "-" side " " startf/100 " " endf/100}' <$dev_dir/text > $dev_dir/segments
awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' $dev_dir/sph.flist > $dev_dir/sph.scp
bash tools/sph2wav.sh --nj ${nj} $train_dir/sph.scp $train_dir/segments $train_dir/wav.scp
bash tools/sph2wav.sh --nj ${nj} $dev_dir/sph.scp $dev_dir/segments $dev_dir/wav.scp
#side A - channel 1, side B - channel 2
# this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
# to the file name sw02001 and the A, e.g.
# sw02001-A sw02001 A
# In this case it's trivial, but in other corpora the information might
# be less obvious. Later it will be needed for ctm scoring.
cat $train_dir/wav_ori.scp | awk '{print $1}' | \
perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; print "$1-$2 $1 $2\n"; ' \
> $train_dir/reco2file_and_channel || exit 1;
cat $dev_dir/wav_ori.scp | awk '{print $1}' | \
perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; print "$1-$2 $1 $2\n"; ' \
> $dev_dir/reco2file_and_channel || exit 1;
cat $train_dir/segments | awk '{spk=substr($1,1,33); print $1 " " spk}' > $train_dir/utt2spk || exit 1;
cat $train_dir/utt2spk | sort -k 2 | tools/utt2spk_to_spk2utt.pl > $train_dir/spk2utt || exit 1;
cat $dev_dir/segments | awk '{spk=substr($1,1,33); print $1 " " spk}' > $dev_dir/utt2spk || exit 1;
cat $dev_dir/utt2spk | sort -k 2 | tools/utt2spk_to_spk2utt.pl > $dev_dir/spk2utt || exit 1;
mkdir -p data/train data/dev
for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
cp data/local/train/$f data/train/$f || exit 1;
done
for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
cp data/local/dev/$f data/dev/$f || exit 1;
done
tools/subset_data_dir.sh --first data/train 4001 data/${train_dev}
n=$(($(wc -l < data/train/segments) - 4001))
tools/subset_data_dir.sh --last data/train ${n} data/${train_nodev}
echo "$0: HKUST data preparation succeeded"
exit 0
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright Chao Weng
# normalizations for hkust transcript
# see the docs/trans-guidelines.pdf for details
while (<STDIN>) {
  @A = split(" ", $_);
  print "$A[0] ";
  for ($n = 1; $n < @A; $n++) {
    $a = $A[$n];
    if (($a eq "{breath}")||($a eq "{cough}")||($a eq "{sneeze}")
        || ($a eq "{lipsmack}")) {next;}
    if (($a eq "{laugh}")) {next;}
    if (($a eq "<noise>")) {next;}
    $tmp = $a;
    if ($tmp =~ /[^.,?+-]{0,}[.,?+-]+/) { $tmp =~ s:([^.,?+-]{0,})[.,?+-]+:$1:g; }
    if ($tmp =~ /\~[A-Z]/) { $tmp =~ s:\~([A-Z]):$1:; }
    if ($tmp =~ /%\S/) { $tmp =~ s:%(\S):$1:; }
    if ($tmp =~ /[a-zA-Z]/) {$tmp=uc($tmp);}
    print "$tmp ";
  }
  print "\n";
}
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU; otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3"
stage=4 # start from 0 if you need to start from data preparation
stop_stage=4
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
# The num of nodes or machines used for multi-machine training
# Default 1 for single machine/node
# NFS will be needed if you want to run multi-machine training
num_nodes=1
# The rank of each node or machine, ranging from 0 to num_nodes - 1.
# The first node/machine sets node_rank 0, the second one sets node_rank 1,
# the third one sets node_rank 2, and so on. Default 0
node_rank=0
nj=16
feat_dir=raw_wav
data_type=raw
num_utts_per_shard=1000
prefetch=100
train_set=train_nodev
dev_set=train_dev
# Optional train_config
# 1. conf/train_transformer.yaml: Standard transformer
# 2. conf/train_conformer.yaml: Standard conformer
# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer
# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer
train_config=conf/train_conformer.yaml
# English modeling unit
# Optional 1. bpe 2. char
en_modeling_unit=bpe
dict=data/dict_$en_modeling_unit/lang_char.txt
cmvn=true
debug=false
num_workers=2
dir=exp/conformer
checkpoint=
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=30
decode_modes="ctc_greedy_search ctc_prefix_beam_search
attention attention_rescoring"
. tools/parse_options.sh || exit 1;
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Data preparation
local/hkust_data_prep.sh /mnt/cfs/database/hkust/LDC2005S15/ \
/mnt/cfs/database/hkust/LDC2005T32/ || exit 1;
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# For wav feature, just copy the data. Fbank extraction is done in training
mkdir -p ${feat_dir}_${en_modeling_unit}
for x in ${train_set} ${dev_set}; do
cp -r data/$x ${feat_dir}_${en_modeling_unit}
done
cp -r data/dev ${feat_dir}_${en_modeling_unit}/test
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp data/${train_set}/wav.scp \
--out_cmvn ${feat_dir}_${en_modeling_unit}/$train_set/global_cmvn
fi
# This BPE model is trained on the LibriSpeech training set.
bpecode=conf/train_960_unigram5000.model
trans_type_ops=
bpe_ops=
if [ $en_modeling_unit = "bpe" ]; then
trans_type_ops="--trans_type cn_char_en_bpe"
bpe_ops="--bpecode ${bpecode}"
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Make train dict
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
paste -d " " \
<(cut -f 1 -d" " ${feat_dir}_${en_modeling_unit}/${train_set}/text) \
<(cut -f 2- -d" " ${feat_dir}_${en_modeling_unit}/${train_set}/text \
| tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' \
| sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " " ) \
> ${feat_dir}_${en_modeling_unit}/${train_set}/text4dict
sed -i 's/\xEF\xBB\xBF//' \
${feat_dir}_${en_modeling_unit}/${train_set}/text4dict
tools/text2token.py -s 1 -n 1 -m ${bpecode} \
${feat_dir}_${en_modeling_unit}/${train_set}/text4dict ${trans_type_ops} \
| cut -f 2- -d" " | tr " " "\n" \
| sort | uniq | grep -a -v -e '^\s*$' \
| grep -v '·' | grep -v '“' | grep -v "”" | grep -v "\[" | grep -v "\]" \
| grep -v "…" \
| awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# Prepare wenet required data
echo "Prepare data, prepare required format"
for x in ${dev_set} ${train_set} test; do
if [ $data_type == "shard" ]; then
tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
--num_threads 16 ${feat_dir}_${en_modeling_unit}/$x/wav.scp \
${feat_dir}_${en_modeling_unit}/$x/text \
$(realpath ${feat_dir}_${en_modeling_unit}/$x/shards) \
${feat_dir}_${en_modeling_unit}/$x/data.list
else
tools/make_raw_list.py ${feat_dir}_${en_modeling_unit}/$x/wav.scp \
${feat_dir}_${en_modeling_unit}/$x/text \
${feat_dir}_${en_modeling_unit}/$x/data.list
fi
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
# You had better rm it manually before you start run.sh on the first node.
# rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp ${feat_dir}_${en_modeling_unit}/$train_set/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process used for knowing whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
--prefetch $prefetch \
--train_data ${feat_dir}_${en_modeling_unit}/$train_set/data.list \
--cv_data ${feat_dir}_${en_modeling_unit}/$dev_set/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 1 \
$cmvn_opts \
--pin_memory \
--bpe_model ${bpecode}
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test model, please specify the model you want to test by --checkpoint
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=-1
ctc_weight=0.5
idx=0
for mode in ${decode_modes}; do
{
test_dir="$dir/"`
`"test_${mode}${decoding_chunk_size:+_chunk$decoding_chunk_size}/test"
mkdir -p $test_dir
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data ${feat_dir}_${en_modeling_unit}/test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--result_file $test_dir/text_${en_modeling_unit} \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
if [ $en_modeling_unit == "bpe" ]; then
tools/spm_decode --model=${bpecode} --input_format=piece \
< $test_dir/text_${en_modeling_unit} | sed -e "s/▁/ /g" > $test_dir/text
else
cat $test_dir/text_${en_modeling_unit} \
| sed -e "s/▁/ /g" > $test_dir/text
fi
# CER is used to be consistent with kaldi & espnet
python tools/compute-cer.py --char=1 --v=1 \
${feat_dir}_${en_modeling_unit}/test/text $test_dir/text > $test_dir/wer
} &
((idx+=1))
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip \
--output_quant_file $dir/final_quant.zip
fi
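
# Note: export_jit.py saves the averaged checkpoint as a TorchScript model
# (final.zip, plus a quantized final_quant.zip here), which is what the
# libtorch runtime referenced in path.sh consumes.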
../../../tools
../../../wenet
# Performance Record
## Conformer Bidecoder Transducer Result
* Feature info: using fbank feature, dither, cmvn, online speed perturb
* Training info: lr 0.001, dynamic batch with max_frames_in_batch 4000, 8 gpu, acc_grad 1, 60 epochs
* Training weight info: transducer_weight 0.75, ctc_weight 0.1, reverse_weight 0.30, average_num 10
* Predictor type: lstm
| decoding mode | dev_clean | dev_other | test_clean | test_other |
|-----------------------|------------|-----------|------------|------------|
| rnnt_greedy_search | 3.42% | 8.99% | 3.56% | 9.15% |
| rnnt_beam_search | 3.35% | 8.77% | 3.45% | 8.78% |
| rnnt_beam_att_rescore | 3.25% | 8.66% | 3.41% | 8.68% |
Pretrained model: https://huggingface.co/yuekai/wenet-asr-librispeech-conformer-transducer-mtl/blob/main/exp/conformer_transducer/avg_10.pt