Commit 764b3a75 authored by Sugon_ldc

add new model
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO
stage=0 # start from -1 if you need to start from data download
stop_stage=2
# The number of nodes or machines used for multi-machine training.
# Default is 1 for a single machine/node.
# NFS is needed if you want to run multi-machine training.
num_nodes=1
# The rank of each node or machine, ranging from 0 to num_nodes - 1.
# The first node/machine sets node_rank 0, the second sets node_rank 1,
# the third sets node_rank 2, and so on. Default is 0.
node_rank=0
# data
download_path=/root/autodl-tmp
french_data=/root/autodl-tmp/cv-corpus-8.0-2022-01-19
# path to save preprocessed data
# export data=data
. ./path.sh
. ./tools/parse_options.sh || exit 1
nj=16
# data_type can be `raw` or `shard`. Typically, `raw` is used for small datasets,
# while `shard` is used for large datasets (over 1k hours); `shard` is
# faster for data reading during training.
data_type=raw
num_utts_per_shard=1000
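# For reference, each line of a `raw` data.list is a JSON record written by
# tools/make_raw_list.py, roughly of the form (hypothetical key and paths):
#   {"key": "utt1", "wav": "/path/to/utt1.wav", "txt": "bonjour"}
# With `shard`, data.list instead lists tar archives that each pack
# num_utts_per_shard utterances.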
train_set=train
# Optional train_config
# 1. conf/train_transformer.yaml: Standard transformer
# 2. conf/train_conformer.yaml: Standard conformer
# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer
# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer
# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding
# 6. conf/train_u2++_conformer.yaml: U2++ conformer
# 7. conf/train_u2++_transformer.yaml: U2++ transformer
train_config=conf/train_conformer.yaml
cmvn=true
dir=exp/conformer
checkpoint=
nbpe=5000
# Using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=20
#decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring"
decode_modes="attention attention_rescoring"
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data download"
echo "download Dataset!"
local/download_data.sh ${download_path} ${french_data}
echo "Finish stage 0"
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "stage 0: Data preparation"
local/prepare_data.sh ${french_data}/fr
echo "Finish stage 0"
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: compute global cmvn"
# compute cmvn
python tools/compute_cmvn_stats.py --num_workers 1 --train_config $train_config \
--in_scp data/${train_set}/wav.scp \
--out_cmvn data/${train_set}/global_cmvn
echo "Finish stage 1"
fi
bpemode=unigram
dict=data/lang_char_/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=data/lang_char_/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p data/lang_char_/
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
# We borrowed the BPE-related code and scripts from ESPnet.
cut -f 2- -d" " data/${train_set}/text > data/lang_char_/input.txt
tools/spm_train --input=data/lang_char_/input.txt --vocab_size=${nbpe} \
--model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
tools/spm_encode --model=${bpemodel}.model --output_format=piece \
< data/lang_char_/input.txt | \
tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
wc -l ${dict}
fi
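# For reference, ${dict} ends up as a plain "token id" table, e.g. (the BPE
# pieces below are hypothetical; ids depend on your data):
#   <blank> 0
#   <unk> 1
#   ▁bonjour 2
#   ...
#   <sos/eos> 5001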
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Prepare data, prepare required format"
for x in dev test ${train_set}; do
if [ $data_type == "shard" ]; then
python tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
--num_threads 16 data/$x/wav.scp data/$x/text \
$(realpath data/$x/shards) data/$x/data.list
else
python tools/make_raw_list.py data/$x/wav.scp data/$x/text \
data/$x/data.list
fi
done
echo "Finish stage 3"
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
mkdir -p $dir
# You have to rm `INIT_FILE` manually when you resume or restart a
# multi-machine training.
INIT_FILE=$dir/ddp_init
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp data/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py rewrites $train_config as $dir/train.yaml with the model input
# and output dimensions filled in; $dir/train.yaml will then be used for
# inference and export.
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process, used for knowing whether it is
# the master or a worker.
rank=`expr $node_rank \* $num_gpus + $i`
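# For example (hypothetical 2-node setup): with num_nodes=2 and 3 GPUs per node,
# world_size=6; node 0 launches ranks 0,1,2 and node 1 launches ranks 3,4,5
# (rank = node_rank * num_gpus + i).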
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
--bpe_model $bpemodel.model \
--train_data data/$train_set/data.list \
--cv_data data/dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 1 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test the model; please specify the model you want to test via --checkpoint
cmvn_opts=
$cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
# TODO, Add model average here
mkdir -p $dir/test
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
ctc_weight=0.5
# Poll GPU ids beginning with index 0
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
idx=0
for mode in ${decode_modes}; do
{
{
test_dir=$dir/test_${mode}
mkdir -p $test_dir
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type "raw" \
--bpe_model $bpemodel.model \
--test_data data/test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 20 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--result_file $test_dir/text_bpe \
--ctc_weight $ctc_weight \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp
cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp
tools/spm_decode --model=${bpemodel}.model --input_format=piece \
< $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value
#sed -e "s/▁/ /g" $test_dir/text_bpe_value_tmp > $test_dir/text_value
paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value > $test_dir/text
# a raw WER without any text refining process
python tools/compute-wer.py --char=1 --v=1 \
data/test/text $test_dir/text > $test_dir/wer
} &
((idx+=1))
if [ $idx -eq $num_gpus ]; then
idx=0
fi
}
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip
fi
../../../tools/
../../../wenet/
# Performance Record
## Conformer Bidecoder (Large) Result
## Conformer Result
* Feature info: using fbank feature, cmvn, dither, online speed perturb
* Training info: train_conformer.yaml, kernel size 15, lr 0.004, batch size 12, 8 gpu, acc_grad 1, 50 epochs, dither 0.0
* Decoding info: ctc_weight 0.5, average_num 10
| decoding mode | test1 | test2 | test3 |
|----------------------------------|------------|------------|------------|
| ctc greedy search | 7.94 | 5.29 | 6.10 |
| ctc prefix beam search | 7.83+ | 5.28 | 6.08 |
| attention decoder | 7.83 | 5.63 | 6.37 |
| attention rescoring | 7.28+ | 4.81 | 5.44 |
Note that "+" means we removed two <0.1s wav files from test1 before decoding.
## Conformer U2++ Result
## Conformer U2 Result
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# dataset related
dataset_conf:
    filter_conf:
        max_length: 2000
        min_length: 50
        token_max_length: 400
        token_min_length: 1
        min_output_input_ratio: 0.05
        max_output_input_ratio: 10.0
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 0.0
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 12
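        # If you switch batch_type to 'dynamic', batching is by total frames per
        # batch rather than a fixed utterance count; in wenet this is typically
        # configured with max_frames_in_batch instead of batch_size, e.g.
        # (illustrative value, adjust to your GPU memory):
        # max_frames_in_batch: 12000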
grad_clip: 5
accum_grad: 1
max_epoch: 50
log_interval: 100
optim: adam
optim_conf:
    lr: 0.004
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
# parse xml files and output simplified version
import xml.dom.minidom
import os
import sys
import multiprocessing
def parsexml(afile, outpath):
    outfile = os.path.join(outpath, afile.split('/')[-1] + '.simp')
    with open(outfile, 'w') as bw:
        domtree = xml.dom.minidom.parse(afile)
        collection = domtree.documentElement
        ipus = collection.getElementsByTagName('IPU')
        for ipu in ipus:
            starttime = 0
            endtime = 0
            if ipu.hasAttribute('IPUStartTime'):
                starttime = ipu.getAttribute('IPUStartTime')
            if ipu.hasAttribute('IPUEndTime'):
                endtime = ipu.getAttribute('IPUEndTime')
            # print('{}\t{}'.format(starttime, endtime))
            # ## original format ###
            wlist = list()
            plainwlist = list()
            pronlist = list()
            # ## pronunciation ###
            lemmalist = list()  # lemma list
            dictlemmalist = list()  # dict lemma list
            for suw in ipu.getElementsByTagName('SUW'):  # short unit word
                txt = ''
                plaintxt = ''
                # PhoneticTranscription
                prontxt = ''
                if suw.hasAttribute('OrthographicTranscription'):
                    txt = suw.getAttribute('OrthographicTranscription')
                if suw.hasAttribute('PlainOrthographicTranscription'):
                    plaintxt = suw.getAttribute('PlainOrthographicTranscription')
                if suw.hasAttribute('PhoneticTranscription'):
                    prontxt = suw.getAttribute('PhoneticTranscription')
                wlist.append(txt)
                plainwlist.append(plaintxt)
                pronlist.append(prontxt)
                lemma = ''
                dictlemma = ''
                if suw.hasAttribute('SUWLemma'):
                    lemma = suw.getAttribute('SUWLemma')
                if suw.hasAttribute('SUWDictionaryForm'):
                    dictlemma = suw.getAttribute('SUWDictionaryForm')
                lemmalist.append(lemma)
                dictlemmalist.append(dictlemma)
            txtsent = ' '.join(wlist)
            plaintxtsent = ' '.join(plainwlist)
            prontxtsent = ' '.join(pronlist)
            lemmasent = ' '.join(lemmalist)
            dictlemmasent = ' '.join(dictlemmalist)
            outrow = '{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                starttime, endtime, txtsent, plaintxtsent,
                prontxtsent, lemmasent, dictlemmasent)
            bw.write(outrow)
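# Each line written to the .simp file above is tab-separated:
#   start_time \t end_time \t ortho \t plain_ortho \t phonetic \t lemma \t dict_lemma
# with the fields taken directly from the IPU/SUW attributes read above.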
def procfolder_orig(apath, outpath):
    count = 0
    for afile in os.listdir(apath):
        if not afile.endswith('.xml'):
            continue
        afile = os.path.join(apath, afile)
        parsexml(afile, outpath)
        count += 1
        print('done: {} [{}]'.format(afile, count))
def procfolder(apath, outpath):
    # count = 0
    fnlist = list()
    for afile in os.listdir(apath):
        if not afile.endswith('.xml'):
            continue
        fnlist.append(afile)
    # now parallel processing:
    nthreads = 16
    for i in range(0, len(fnlist), nthreads):
        # fnlist[i, i+16]
        pool = multiprocessing.Pool(processes=nthreads)
        for j in range(nthreads):
            if i + j < len(fnlist):
                afile = os.path.join(apath, fnlist[i + j])
                pool.apply_async(parsexml, (afile, outpath))
        pool.close()
        pool.join()
    print('parallel {} threads done for {} files in total.'.format(
        nthreads, len(fnlist)))
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Usage: {} <in.csj.path> <out.csj.path>".format(sys.argv[0]))
        exit(1)
    # e.g., csjpath='/workspace/asr/csj/'
    csjpath = sys.argv[1]
    outcsjpath = sys.argv[2]
    apath = os.path.join(csjpath, 'XML/BaseXML/core')
    apath2 = os.path.join(csjpath, 'XML/BaseXML/noncore')
    outapath = os.path.join(outcsjpath, 'xml')
    # create the "outapath" dir:
    if not os.path.exists(outapath):
        os.mkdir(outapath)
    # range over the following two folders:
    procfolder(apath, outapath)
    procfolder(apath2, outapath)
# based on xml.simp -> start_time and end_time -> split using sox
import os
import sys
import multiprocessing
import librosa
import soundfile as sf
# use .simp as the source for .wav file splitting
def wavfn(apath):
    wavdict = dict()  # key=id, value=full.path of .wav
    for awavfn in os.listdir(apath):
        fullwavpath = os.path.join(apath, awavfn)
        aid = awavfn.replace('.wav', '')
        wavdict[aid] = fullwavpath
    return wavdict
def xmlfn(apath):
    xmldict = dict()  # key=id, value=full.path of .xml.simp
    for axmlfn in os.listdir(apath):
        if not axmlfn.endswith('.xml.simp'):
            continue
        axmlfn2 = os.path.join(apath, axmlfn)
        aid = axmlfn.replace('.xml.simp', '')
        # print('obtain id: {}\t{}'.format(axmlfn, aid))
        xmldict[aid] = axmlfn2
    return xmldict
def ch2to1(f1, outf1):
    wav1, _ = librosa.load(f1, sr=16000, mono=False)
    if wav1.ndim == 1:
        return
    wav1mono = librosa.to_mono(wav1)
    sf.write(outf1, wav1mono, 16000)
    # print('2ch to 1ch, {} -> {}'.format(f1, outf1))
    acmd = 'mv {} {}'.format(outf1, f1)
    res = os.system(acmd)
    # rename the .1ch file back to the .wav file and
    # overwrite the old .wav file which is 2ch
    # print(res, acmd)
def proc1file(fullxmlfn, fullwavfn, outwavpath):
    with open(fullxmlfn) as xmlbr:
        for axmlline in xmlbr.readlines():
            # start.time end.time ortho plainortho phonetic
            axmlline = axmlline.strip()
            cols = axmlline.split('\t')
            stime = cols[0]
            etime = cols[1]
            if len(cols) == 2:
                continue  # skip
            basename = fullwavfn.split('/')[-1]
            name2 = '{}_{}_{}.wav'.format(basename, stime, etime)
            partwavfn = os.path.join(outwavpath, name2)
            dur = float(etime) - float(stime)
            acmd = 'sox {} {} trim {} {}'.format(fullwavfn, partwavfn, stime, dur)
            res = os.system(acmd)
            # print(res, acmd)
            # perform 2ch to 1ch if necessary!
            partwavfn1ch = partwavfn + ".1ch.wav"  # NOTE: must end with '.wav',
            # otherwise soundfile.write will report an error!
            ch2to1(partwavfn, partwavfn1ch)
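# For a hypothetical recording S04F1228.wav with an IPU spanning 00458.875s to
# 00459.209s, the sox command built above looks roughly like:
#   sox .../WAV/core/S04F1228.wav data/wav/S04F1228.wav_00458.875_00459.209.wav trim 00458.875 0.334
# i.e. each segment keeps the original basename plus its start/end times.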
def procpath(atag, csjpath, xmlsimppath, outwavpath, idset):
    # atag = 'core' or 'noncore'
    axmlpath = xmlsimppath
    awavpath = os.path.join(csjpath, atag)
    xmldict = xmlfn(axmlpath)
    wavdict = wavfn(awavpath)
    wavidlist = list(wavdict.keys())
    # parallel processing
    nthreads = 16
    for i in range(0, len(wavidlist), nthreads):
        pool = multiprocessing.Pool(processes=nthreads)
        for j in range(nthreads):
            if i + j < len(wavidlist):
                wavid = wavidlist[i + j]
                if len(idset) > 0 and wavid not in idset:
                    # when idset is not empty, then only process the ids
                    # that are included in idset:
                    continue
                fullwavfn = wavdict[wavid]
                if wavid in xmldict:
                    fullxmlfn = xmldict[wavid]
                    pool.apply_async(proc1file, (fullxmlfn, fullwavfn, outwavpath))
        pool.close()
        pool.join()
    print('parallel {} threads done for {} files.'.format(
        nthreads,
        len(wavidlist)))
if __name__ == '__main__':
    if len(sys.argv) < 4:
        print(
            "Usage: {}".format(sys.argv[0]) +
            " <in.csj.path> <in.xml.simp.path> <out.wav.path> [id.list.fn]")
        exit(1)
    csjpath = sys.argv[1]
    xmlsimppath = sys.argv[2]
    outwavpath = sys.argv[3]
    idlistfn = sys.argv[4] if len(sys.argv) == 5 else ""
    idset = set()
    if len(idlistfn) > 0:
        with open(idlistfn) as br:
            for aline in br.readlines():
                aline = aline.strip()
                idset.add(aline)
        print(idset)
    for atag in ['core', 'noncore']:
        procpath(atag, csjpath, xmlsimppath, outwavpath, idset)
import os
import sys
# train test1 test2 test3
def readtst(tstfn):
    outlist = list()
    with open(tstfn) as br:
        for aline in br.readlines():
            aline = aline.strip()
            outlist.append(aline)
    return outlist
def split_train_tests_xml(xmlpath, test1fn, test2fn, test3fn):
    test1list = readtst(test1fn)
    test2list = readtst(test2fn)
    test3list = readtst(test3fn)
    outtrainlist = list()  # full path ".xml.simp" files
    outt1list = list()  # test 1, full path ".xml.simp" files
    outt2list = list()
    outt3list = list()
    for afile in os.listdir(xmlpath):
        if not afile.endswith('.xml.simp'):
            continue
        afile2 = xmlpath + '/' + afile
        aid = afile.split('.')[0]
        if aid in test1list:
            outt1list.append(afile2)
        elif aid in test2list:
            outt2list.append(afile2)
        elif aid in test3list:
            outt3list.append(afile2)
        else:
            outtrainlist.append(afile2)
    return outtrainlist, outt1list, outt2list, outt3list
def all_wavs(wavpath):
    wavlist = list()
    for afile in os.listdir(wavpath):
        if not afile.endswith('.wav'):
            continue
        afile2 = wavpath + '/' + afile
        wavlist.append(afile2)
    return wavlist
def gen_text(xmllist, outpath):
    # id \t text
    # e.g., /workspace/asr/wenet/examples/csj/s0/data/xml/S11M1689.xml.simp
    # ID = S11M1689_stime_etime
    outtxtfn = os.path.join(outpath, 'text')
    with open(outtxtfn, 'w') as bw:
        for xmlfn in xmllist:
            aid = xmlfn.split('/')[-1]
            aid2 = aid.split('.')[0]
            with open(xmlfn) as br:
                for aline in br.readlines():
                    aline = aline.strip()
                    # stime \t etime \t text1 \t text2 \t text3 \t text4 \t text5
                    cols = aline.split('\t')
                    # TODO: difference between "< 7" and "< 4"? strange
                    # -> use "< 4", DO NOT use "< 7" !
                    if len(cols) < 4:
                        continue
                    stime = cols[0]
                    etime = cols[1]
                    atxt = cols[3].replace(' ', '')
                    afullid = '{}_{}_{}'.format(aid2, stime, etime)
                    aoutline = '{}\t{}\n'.format(afullid, atxt)
                    bw.write(aoutline)
def parse_xml_set(xmllist):
    outset = set()
    for xml in xmllist:
        aid = xml.split('/')[-1]
        aid2 = aid.split('.')[0]
        outset.add(aid2)
    return outset
def gen_wav_scp(xmllist, wavlist, outpath):
    # xmlset = pure id set, like 'S04F1228'
    # can be from train, test1, test2, or test3
    xmlset = parse_xml_set(xmllist)
    outwavscpfn = os.path.join(outpath, 'wav.scp')
    with open(outwavscpfn, 'w') as bw:
        for wav in wavlist:
            # wav is like "/workspace/asr/wenet/examples/csj/s0/data
            # /wav/S04F1228.wav_00458.875_00459.209.wav"
            aid = wav.split('/')[-1]
            cols = aid.split('_')
            aid2 = cols[0].split('.')[0]
            if aid2 not in xmlset:
                continue
            stime = cols[1]
            etime = cols[2].replace('.wav', '')
            afullid = '{}_{}_{}'.format(aid2, stime, etime)
            wavabspath = os.path.abspath(wav)
            aoutline = '{}\t{}\n'.format(afullid, wavabspath)
            bw.write(aoutline)
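# The two files produced above follow Kaldi-style conventions, one utterance per
# line (hypothetical example):
#   text:    S04F1228_00458.875_00459.209 <tab> transcription
#   wav.scp: S04F1228_00458.875_00459.209 <tab> /abs/path/data/wav/S04F1228.wav_00458.875_00459.209.wav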
def prep_text_wavscp(
        xmlpath, wavpath, test1fn, test2fn, test3fn,
        outtrainpath, out1path, out2path, out3path):
    trainlist, t1list, t2list, t3list = split_train_tests_xml(
        xmlpath,
        test1fn,
        test2fn,
        test3fn)
    wavlist = all_wavs(wavpath)
    gen_text(trainlist, outtrainpath)
    gen_text(t1list, out1path)
    gen_text(t2list, out2path)
    gen_text(t3list, out3path)
    gen_wav_scp(trainlist, wavlist, outtrainpath)
    gen_wav_scp(t1list, wavlist, out1path)
    gen_wav_scp(t2list, wavlist, out2path)
    gen_wav_scp(t3list, wavlist, out3path)
if __name__ == '__main__':
    if len(sys.argv) < 10:
        print(
            "Usage: {}".format(sys.argv[0]) + " <xmlpath> " +
            "<wavpath> <test1fn> <test2fn> <test3fn> " +
            "<outtrainpath> <out1path> <out2path> <out3path>")
        exit(1)
    xmlpath = sys.argv[1]
    wavpath = sys.argv[2]
    test1fn = sys.argv[3]
    test2fn = sys.argv[4]
    test3fn = sys.argv[5]
    outtrainpath = sys.argv[6]
    out1path = sys.argv[7]
    out2path = sys.argv[8]
    out3path = sys.argv[9]
    prep_text_wavscp(xmlpath, wavpath, test1fn,
                     test2fn, test3fn, outtrainpath,
                     out1path, out2path, out3path)
import librosa
# import os
import sys
def mincut(wavscpfn, minsec):
    outfn = wavscpfn + "_" + str(minsec)
    with open(outfn, 'w') as bw:
        with open(wavscpfn) as br:
            for aline in br.readlines():
                aline = aline.strip()
                afn = aline.split('\t')[1]
                # print(afn)
                dur = librosa.get_duration(filename=afn)
                if dur >= minsec:
                    bw.write(aline + '\n')
# wn.3.mincut.py <wav.scp> <min.sec>
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print('{} <in.wav.scp> <min.sec.cut>'.format(sys.argv[0]))
        exit()
    wavscpfn = sys.argv[1]
    minsec = float(sys.argv[2])
    mincut(wavscpfn, minsec)
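# mincut() writes a filtered copy next to the input named "<wav.scp>_<minsec>"
# (e.g. wav.scp_0.1), keeping only utterances at least minsec seconds long;
# run.sh later uses this filtered list for cmvn computation and data.list creation.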
#!/usr/bin/env python3
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--segments', default=None, help='segments file')
    parser.add_argument('wav_file', help='wav file')
    parser.add_argument('text_file', help='text file')
    parser.add_argument('output_file', help='output list file')
    args = parser.parse_args()
    wav_table = {}
    with open(args.wav_file, 'r', encoding='utf8') as fin:
        for line in fin:
            arr = line.strip().split()
            assert len(arr) == 2
            wav_table[arr[0]] = arr[1]
    if args.segments is not None:
        segments_table = {}
        with open(args.segments, 'r', encoding='utf8') as fin:
            for line in fin:
                arr = line.strip().split()
                assert len(arr) == 4
                segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3]))
    with open(args.text_file, 'r', encoding='utf8') as fin, \
            open(args.output_file, 'w', encoding='utf8') as fout:
        for line in fin:
            arr = line.strip().split(maxsplit=1)
            key = arr[0]
            txt = arr[1] if len(arr) > 1 else ''
            if args.segments is None:
                # assert key in wav_table
                if key in wav_table:
                    wav = wav_table[key]
                    line = dict(key=key, wav=wav, txt=txt)
                else:
                    line = None
            else:
                # assert key in segments_table
                if key in segments_table:
                    wav_key, start, end = segments_table[key]
                    wav = wav_table[wav_key]
                    line = dict(key=key, wav=wav, txt=txt, start=start, end=end)
                else:
                    line = None
            if line:
                json_line = json.dumps(line, ensure_ascii=False)
                fout.write(json_line + '\n')
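# When --segments is given, each segments line is "<utt_id> <wav_id> <start> <end>"
# and the emitted JSON record also carries start/end, e.g. (hypothetical):
#   {"key": "utt1", "wav": "/path/a.wav", "txt": "hello", "start": 1.0, "end": 2.5}
# Without --segments, records contain only key/wav/txt as in the code above.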
D01F0002
D01F0003
D01F0023
D01F0030
D01F0046
D01F0049
D01F0055
D01F0057
D01M0005
D01M0009
D01M0012
D01M0019
D01M0020
D01M0042
D01M0043
D01M0047
D02F0015
D02F0018
D02F0025
D02F0027
D02F0031
D02F0032
D02F0033
D02F0054
D02M0014
D02M0016
D02M0024
D02M0026
D02M0028
D02M0035
D02M0039
D02M0051
D03F0001
D03F0006
D03F0008
D03F0034
D03F0036
D03F0040
D03F0045
D03F0058
D03M0004
D03M0007
D03M0013
D03M0017
D03M0037
D03M0038
D03M0048
D03M0053
D04F0011
D04F0022
D04F0029
D04F0044
D04F0050
D04M0010
D04M0021
D04M0041
D04M0052
D04M0056
A01M0097
A04M0051
A04M0121
A03M0156
A03M0112
A01M0110
A05M0011
A03M0106
A01M0137
A04M0123
A01M0097
A04M0051
A04M0121
A03M0156
A03M0112
A01M0110
A05M0011
A03M0106
A01M0137
A04M0123
A01F0063
A01M0056
A06F0135
A02M0012
A06M0064
A01M0141
A01F0034
A03M0016
A03F0072
A01F0001
S00F0066
S00M0213
S00M0070
S00M0008
S01F0105
S00F0148
S00F0019
S00M0112
S00F0152
S00M0079
A01F0063
A01M0056
A06F0135
A02M0012
A06M0064
A01M0141
A01F0034
A03M0016
A03F0072
A01F0001
S00F0066
S00M0213
S00M0070
S00M0008
S01F0105
S00F0148
S00F0019
S00M0112
S00F0152
S00M0079
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# 1. xml split by sentences
# 2. wav split by xml.simp's guidance
# 3. generate "text" and "wav.scp" files as required by wenet
# 4. compute cmvn; wav length should be >= 0.1s, otherwise errors happen...
# 5. SentencePiece BPE vocabulary
# 6. make "data.list" files
# 7. train -> 50 epochs
stage=1 # start from this stage (1 = xml preprocessing)
stop_stage=8
# data
#data_url=www.openslr.org/resources/12
# TODO use your own data path
datadir=/workspace/asr/csj
# output wav data dir
wave_data=data # wave file path
# Optional train_config
train_config=conf/train_conformer.yaml
checkpoint=
cmvn=true # cmvn is for mean, variance, frame_number statistics
do_delta=false # not used...
dir=exp/sp_spec_aug # model's dir (output dir)
# Using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
# You may need to adjust this if you cannot get results close to those in README.md
average_num=10
decode_modes="attention_rescoring ctc_greedy_search ctc_prefix_beam_search attention"
. tools/parse_options.sh || exit 1;
# bpemode (unigram or bpe)
nbpe=4096 # TODO -> you can change this value to 5000, 100000 and so on
bpemode=bpe #unigram # TODO -> you can use unigram and other methods
set -e # exit immediately if any command exits with a non-zero status
set -u # treat unset variables as an error
set -o pipefail # a pipeline's exit status is that of the last command to fail
train_set=train
dev_set=dev
recog_set="test1 test2 test3"
### CSJ data is not free!
# Purchase URL: https://ccd.ninjal.ac.jp/csj/en/
### data preparing - split xml by sentences ###
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
### I did not follow the ESPnet or Kaldi pre-processing;
### I developed my own approach, so use it at your own risk.
echo "stage 1: Data preparation -> xml preprocessing "
echo " -> extract [start.time, end.time, text] from raw xml files"
python ./csj_tools/wn.0.parse.py $datadir ${wave_data}
fi
in_wav_path=$datadir/WAV
xml_simp_path=${wave_data}/xml
#wav_split_path=${wave_data}/wav.2
wav_split_path=${wave_data}/wav
mkdir -p ${wav_split_path}
### data preparing - split wav by xml.simp's guidance ###
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: Data preparation -> wav preprocessing "
echo " -> split wav file by xml.simp's [start.time, end.time, text] format"
# in addition, 2ch to 1ch!
python ./csj_tools/wn.1.split_wav.py ${in_wav_path} ${xml_simp_path} ${wav_split_path}
fi
### data preparing - generate "text" and "wav.scp" files ###
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: prepare text and wav.scp for train/test1/test2/test3 from wav and xml folders"
t1fn='list_files/test.set.1.list'
t2fn='list_files/test.set.2.list'
t3fn='list_files/test.set.3.list'
outtrain=${wave_data}/train
outt1=${wave_data}/test1
outt2=${wave_data}/test2
outt3=${wave_data}/test3
mkdir -p $outtrain
mkdir -p $outt1
mkdir -p $outt2
mkdir -p $outt3
python ./csj_tools/wn.2.prep.text.py \
${xml_simp_path} ${wav_split_path} \
$t1fn $t2fn $t3fn \
$outtrain $outt1 $outt2 $outt3
fi
minsec=0.1
### compute static info: mean, variance, frame_num ###
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "stage 4: Feature Generation"
# TODO: if this fails, make sure your wav files are all >= 0.1s ...
mkdir -p $wave_data/dev
# merge total dev data
for set in test1 test2 test3; do
for f in `ls $wave_data/$set`; do
cat $wave_data/$set/$f >> $wave_data/$dev_set/$f
done
done
python ./csj_tools/wn.3.mincut.py $wave_data/$train_set/wav.scp $minsec
tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
--in_scp $wave_data/$train_set/wav.scp_$minsec \
--out_cmvn $wave_data/$train_set/global_cmvn
fi
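# The resulting global_cmvn is a small JSON file written by
# tools/compute_cmvn_stats.py holding the accumulated per-dimension mean and
# variance statistics plus the total frame count; training reads it through
# the --cmvn option (see cmvn_opts in the training stage below).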
### use sentence piece to construct subword vocabulary ###
dict=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 5: Dictionary and Json Data Preparation"
mkdir -p data/lang_char/
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
# We borrowed the BPE-related code and scripts from ESPnet.
cut -f 2- -d" " $wave_data/${train_set}/text > $wave_data/lang_char/input.txt
tools/spm_train \
--input=$wave_data/lang_char/input.txt \
--vocab_size=${nbpe} \
--model_type=${bpemode} \
--model_prefix=${bpemodel} \
--input_sentence_size=100000000
tools/spm_encode \
--model=${bpemodel}.model \
--output_format=piece < $wave_data/lang_char/input.txt | \
tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
wc -l ${dict}
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Prepare wenet required data
echo "Prepare data, prepare required format"
for x in $train_set ; do
python csj_tools/wn.4.make_raw_list.py $wave_data/$x/wav.scp_$minsec $wave_data/$x/text \
$wave_data/$x/data.list
done
for x in $dev_set ${recog_set} ; do
python csj_tools/wn.4.make_raw_list.py $wave_data/$x/wav.scp $wave_data/$x/text \
$wave_data/$x/data.list
done
fi
### Training! ###
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
cmvn_opts=
$cmvn && cmvn_opts="--cmvn $wave_data/${train_set}/global_cmvn"
# train.py rewrites $train_config as $dir/train.yaml with the model input
# and output dimensions filled in; train.yaml will be used later for inference
# and model export.
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type raw \
--symbol_table $dict \
--train_data $wave_data/$train_set/data.list \
--cv_data $wave_data/$dev_set/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $num_gpus \
--ddp.rank $i \
--ddp.dist_backend $dist_backend \
--num_workers 1 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
### test model ###
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
# Test the model; please specify the model you want to test via --checkpoint
cmvn_opts=
$cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
mkdir -p $dir/test
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=-1
ctc_weight=0.5
# Polling GPU id begin with index 0
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
idx=0
for test in $recog_set; do
for mode in ${decode_modes}; do
{
{
test_dir=$dir/${test}_${mode}
mkdir -p $test_dir
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
python wenet/bin/recognize.py --gpu $gpu_id \
--mode $mode \
--config $dir/train.yaml \
--data_type raw \
--test_data $wave_data/$test/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--result_file $test_dir/text_bpe \
--ctc_weight $ctc_weight \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp
cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp
tools/spm_decode --model=${bpemodel}.model --input_format=piece \
< $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value_tmp
paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value_tmp > $test_dir/text
python tools/compute-wer.py --char=1 --v=1 \
$wave_data/$test/text $test_dir/text > $test_dir/wer
} &
((idx+=1))
if [ $idx -eq $num_gpus ]; then
idx=0
fi
}
done
done
wait
fi
../../../tools
../../../wenet
# GigaSpeech
A large, modern, and evolving dataset for automatic speech recognition. More details about GigaSpeech can be found at https://github.com/SpeechColab/GigaSpeech.
# Performance Record
## Conformer bidecoder Result
* Feature info: using fbank feature, dither 1.0, cmvn, 16k
* Training info: conf/train_conformer_bidecoder.yaml, subsample 4, kernel size 31, lr 0.001, batch size 24, 8 gpu, acc_grad 4, 40 epochs
* Decoding info: ctc_weight 0.3, reverse_weight 0.5, average_num 10
* Git hash: 9a0c270f9f976d7e887f777690e6c358a45a1c27
### test set gigaspeech scoring
| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err |
|-----------|-------|--------|------|-----|-----|-----|------|-------|
| Sum/Avg | 19928 | 390656 | 91.4 | 6.4 | 2.2 | 2.0 | 10.6 | 63.1 |
| Mean | 152.1 | 2982.1 | 91.4 | 6.3 | 2.3 | 1.7 | 10.3 | 63.7 |
| S.D. | 142.2 | 2838.1 | 5.5 | 4.1 | 1.6 | 1.3 | 6.4 | 16.9 |
| Median | 108.0 | 2000.0 | 93.0 | 5.1 | 2.0 | 1.3 | 8.4 | 64.6 |
### dev set gigaspeech scoring
| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err |
|-----------|-------|--------|------|-----|-----|-----|------|-------|
| Sum/Avg | 5715 | 127790 | 92.1 | 5.8 | 2.1 | 2.8 | 10.7 | 69.9 |
| Mean | 204.1 | 4563.9 | 92.9 | 5.2 | 1.9 | 2.0 | 9.1 | 69.4 |
| S.D. | 269.7 | 4551.6 | 3.4 | 2.7 | 0.9 | 1.7 | 4.6 | 15.9 |
| Median | 151.5 | 3314.0 | 93.8 | 4.4 | 1.6 | 1.7 | 7.9 | 71.6 |
## Conformer U2++ Result
* Feature info: using fbank feature, dither 1.0, cmvn, 16k
* Training info: conf/train_u2++_conformer.yaml, subsample 6, kernel size 31, lr 0.001, batch size 28, 8 gpu, acc_grad 1, 50 epochs
* Decoding info: ctc_weight 0.3, reverse_weight 0.5, average_num 10
* Git hash: 9a0c270f9f976d7e887f777690e6c358a45a1c27
### test set gigaspeech scoring, full chunk (non-streaming)
| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err |
|-----------|-------|--------|------|-----|-----|-----|------|-------|
| Sum/Avg | 19928 | 390656 | 90.7 | 6.8 | 2.6 | 2.0 | 11.3 | 66.9 |
| Mean | 152.1 | 2982.1 | 90.6 | 6.8 | 2.7 | 1.6 | 11.1 | 67.1 |
| S.D. | 142.2 | 2838.1 | 5.8 | 4.3 | 1.9 | 1.2 | 6.7 | 16.5 |
| Median | 108.0 | 2000.0 | 92.1 | 5.7 | 2.2 | 1.3 | 9.0 | 68.9 |
### test set gigaspeech scoring, chunk 8 (latency range from 0 to 480ms)
| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err |
|-----------|-------|--------|------|-----|-----|-----|------|-------|
| Sum/Avg | 19928 | 390656 | 89.6 | 7.5 | 2.9 | 2.0 | 12.5 | 70.1 |
| Mean | 152.1 | 2982.1 | 89.3 | 7.6 | 3.1 | 1.7 | 12.4 | 70.6 |
| S.D. | 142.2 | 2838.1 | 6.5 | 4.9 | 2.1 | 1.2 | 7.3 | 15.8 |
| Median | 108.0 | 2000.0 | 91.1 | 6.3 | 2.5 | 1.4 | 10.2 | 72.2 |
## Conformer Result
* Feature info: using fbank feature, dither 1.0, no cmvn, 48k
* Training info: conf/train_conformer.yaml, kernel size 31, lr 0.001, batch size 24, 8 gpu, acc_grad 4, 30 epochs
* Decoding info: ctc_weight 0.5, average_num 5
* Git hash: 9a0c270f9f976d7e887f777690e6c358a45a1c27
### test set gigaspeech scoring
| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err |
|---------------|-------|--------|------|-----|-----|-----|------|-------|
| Sum/Avg | 19930 | 390744 | 90.8 | 6.9 | 2.3 | 2.0 | 11.2 | 65.1 |
| Mean | 152.1 | 2982.8 | 90.6 | 6.9 | 2.5 | 1.7 | 11.1 | 65.7 |
| S.D. | 142.3 | 2839.0 | 5.8 | 4.3 | 1.7 | 1.2 | 6.7 | 16.6 |
| Median | 108.0 | 2000.0 | 92.5 | 5.6 | 2.1 | 1.3 | 9.1 | 65.9 |