Commit 764b3a75 authored by Sugon_ldc

add new model

# Conformer-based end-to-end model for the VKW challenge
## Standard E2E Results
Conformer without speed perturb and LM
* config: conf/train_train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char.yaml
* beam: 10
* num of gpu: 8
* num of averaged model: 5
* ctc weight (used for attention rescoring): 0.5
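For reference, below is a minimal sketch of how these decoding settings map onto the WeNet tools invoked in `run.sh` (checkpoint averaging followed by attention-rescoring decoding). `$dir` is the experiment directory from `run.sh`; the flags follow the `average_model.py`/`recognize.py` invocations used elsewhere in this recipe and may differ slightly across WeNet versions:

```bash
# average the 5 best checkpoints (by validation loss) into one model
python3 wenet/bin/average_model.py \
  --dst_model $dir/avg_5.pt \
  --src_path $dir \
  --num 5 \
  --val_best

# decode the dev set with attention rescoring, beam 10, ctc weight 0.5
python3 wenet/bin/recognize.py --gpu 0 \
  --mode attention_rescoring \
  --config $dir/train.yaml \
  --data_type raw \
  --test_data data/combine_dev/data.list \
  --checkpoint $dir/avg_5.pt \
  --beam_size 10 \
  --ctc_weight 0.5 \
  --dict data/dict/lang_char.txt \
  --result_file $dir/text
```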
Dev set results, trained only on the training set (785 keywords, 1505-hour train set):
| scenario | Precision | Recall | F1 | ATWV |
|----------|-----------|----------|--------|--------|
| lgv | 0.9281 | 0.6420 | 0.7590 | 0.5183 |
| liv | 0.8886 | 0.6515 | 0.7518 | 0.6050 |
| stv | 0.9120 | 0.7471 | 0.8213 | 0.6256 |
Dev set results, trained on the training set plus the finetune set (785 keywords, 1505-hour train set + 15-hour finetune set):
| scenario | Precision | Recall | F1 | ATWV |
|----------|-----------|----------|--------|--------|
| lgv | 0.9478 | 0.7311 | 0.8255 | 0.6352 |
| liv | 0.9177 | 0.8398 | 0.8770 | 0.7412 |
| stv | 0.9320 | 0.8207 | 0.8729 | 0.7120 |
Test set results, trained only on the training set (384 keywords, 1505-hour train set):
| scenario | Precision | Recall | F1 | ATWV |
|----------|-----------|----------|--------|--------|
| lgv | 0.6262 | 0.5648 | 0.5939 | 0.5825 |
| liv | 0.8797 | 0.6282 | 0.7330 | 0.6061 |
| stv | 0.9102 | 0.7221 | 0.8053 | 0.6682 |
Test set results, trained on the training set plus the finetune set (384 keywords, 1505-hour train set + 15-hour finetune set):
| scenario | Precision | Recall | F1 | ATWV |
|----------|-----------|----------|--------|--------|
| lgv | 0.6469 | 0.6276 | 0.6371 | 0.6116 |
| liv | 0.9278 | 0.7560 | 0.8331 | 0.6927 |
| stv | 0.9434 | 0.8061 | 0.8693 | 0.7275 |
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 8
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: false
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
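# With these weights the total training objective is, roughly,
#   loss = ctc_weight * loss_ctc + (1 - ctc_weight) * loss_att
# and, since the decoder is a bitransformer, the attention loss mixes the
# left-to-right and right-to-left branches as
#   loss_att = (1 - reverse_weight) * loss_l2r + reverse_weight * loss_r2l
# (a sketch of WeNet's hybrid CTC/attention loss, not an exact trace of the code)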
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
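# In the WeNet dataset pipeline, utterances are shuffled within a buffer of
# shuffle_size and then sorted by feature length within windows of sort_size,
# so each static batch groups utterances of similar duration; that is why
# sort_size should stay below shuffle_size.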
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 100
log_interval: 400
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 8
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: false
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.1
src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
# use raw_wav or kaldi feature
raw_wav: false
dataset_conf:
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 16
grad_clip: 5
accum_grad: 1
max_epoch: 100
log_interval: 400
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
#!/bin/bash
# Copyright 2021 Tencent Inc. (Author: Yougen Yuan).
# Apache 2.0
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU; otherwise it is multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
stage=-1
stop_stage=0
# The num of nodes
num_nodes=1
# The rank of current node
node_rank=0
# data
data=data
dict=data/dict/lang_char.txt
data_type=raw # raw or shard
train_set=train
dev_set=combine_dev
finetune2_set=combine_finetune_5h
# Optional train_config
name=vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char
train_config=conf/${finetune2_set}_${name}.yaml
cmvn=true
dir=exp/${finetune2_set}_${name}_new
checkpoint=$dir/0.pt
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=10
. tools/parse_options.sh || exit 1;
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# Data preparation
local/vkw_data_prep.sh
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
x=finetune_5h
for z in lgv liv stv; do
[ ! -f data/vkw/label/lab_${z}/${x}/wav_ori.scp ] && \
mv data/vkw/label/lab_${z}/${x}/wav.scp \
data/vkw/label/lab_${z}/${x}/wav_ori.scp && \
cut -d " " -f 1,4 data/vkw/label/lab_${z}/${x}/wav_ori.scp \
> data/vkw/label/lab_${z}/${x}/wav.scp
done
y=`echo $x | cut -d "_" -f 1`
mkdir -p combine_${y}
for f in text wav.scp segments; do
for z in lgv liv stv; do
cat data/vkw/label/lab_${z}/${x}/$f
done > combine_${y}/$f
done
# remove the spaces between text labels for the Mandarin dataset
# download and transfer to wav.scp
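# e.g. "UTT001 今 天 天 气" becomes "UTT001 今天天气" (keep the first field,
# strip spaces from the rest)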
cp data/${finetune2_set}/text data/${finetune2_set}/text.org
paste -d " " <(cut -f 1 -d" " data/${finetune2_set}/text.org) \
<(cut -f 2- -d" " data/${finetune2_set}/text.org | tr -d " ") \
> data/${finetune2_set}/text
rm data/${finetune2_set}/text.org
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: generate segmented wav.scp and compute cmvn"
## For wav feature, just copy the data. Fbank extraction is done in training
[ ! -f $data/$finetune2_set/segmented_wav.scp ] && \
python tools/segment.py --segments $data/$finetune2_set/segments \
--input $data/$finetune2_set/wav.scp \
--output $data/$finetune2_set/segmented_wav.scp
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Prepare data, prepare required format"
tools/make_raw_list.py --segments $data/$finetune2_set/segments \
$data/$finetune2_set/wav.scp $data/$finetune2_set/text $data/$finetune2_set/data.list
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
# You had better rm it manually before you start run.sh on first node.
# rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
# The number of gpus running on each node/machine
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp ${data}/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process used for knowing whether it is
# the master of a worker.
rank=$i ###`expr $node_rank \* $num_gpus + $i`
echo "start training"
[ ! -f exp/train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char_new/avg_5.pt ] && \
echo "Please use a pretrained model for finetuning" && exit 0
[ ! -f $checkpoint ] && \
cp exp/train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char_new/avg_5.pt $checkpoint && \
cp exp/train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char_new/0.yaml $dir/0.yaml
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
--train_data $data/${finetune2_set}/data.list \
--cv_data $data/${dev_set}/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 4 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
[ ! -f $decode_checkpoint ] && \
python3 wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Test model, please specify the model you want to use by --checkpoint
sets=${dev_set}
keywords_list=$data/vkw/keyword/kwlist
input_data=$data/${sets}/data.list
checkpoint=$dir/avg_${average_num}.pt
keyword_results=$dir/keyword_results_${sets}
ctc_results=$dir/ctc_results_${sets}
python3 local/vkw_kws_results.py --gpu 0 \
--config $dir/train.yaml \
--data_type $data_type \
--symbol_table $dict \
--num_workers 4 \
--prefetch 32 \
--input_data $input_data \
--checkpoint $checkpoint \
--keyword_unit_dict $keywords_list \
--keyword_results $keyword_results \
--ctc_results $ctc_results
[ ! -f scripts/bin/results_to_score.sh ] && \
ln -sf data/vkw/scripts scripts && chmod -R 755 scripts
### attention: install the F4DE tool before testing
for y in "stv" "lgv" "liv"; do
mkdir -p $dir/dev_${y}
#[ ! -f data/vkw/score/dev_${y}/utter_map ] && \
if [ $y == "lgv" ]; then
grep "TV1" $keyword_results > $dir/dev_${y}/kws_results
elif [ $y == "liv" ]; then
grep "sph_live" $keyword_results > $dir/dev_${y}/kws_results
elif [ $y == "stv" ]; then
grep "sph_video" $keyword_results > $dir/dev_${y}/kws_results
else
"invalid $y"
fi
./data/vkw/scripts/bin/results_to_score.sh \
data/vkw/score/dev_${y}/ecf \
data/vkw/label/lab_${y}/dev_5h/segments \
data/vkw/score/dev_${y}/utter_map \
$dir/dev_${y}/kws_results \
data/vkw/keyword/kwlist.xml \
data/vkw/score/dev_${y}/rttm
./data/vkw/scripts/bin/F1.sh \
$dir/dev_${y}/kws_outputs/f4de_scores_unnormalized/alignment.csv
done
fi
#!/bin/bash
# Copyright 2021 Tencent Inc. (Author: Yougen Yuan).
# Apache 2.0
current_dir=$(pwd)
stage=0
stop_stage=0
. ./path.sh || exit 1;
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
cd $current_dir/data/
[ ! -f vkw_v1.1.zip ] && echo "wget vkw challenge data to this directory" && exit 0
[ ! -d vkw ] && unzip vkw_v1.1.zip
cd $current_dir
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
x=train
[ ! -f data/${x}/text ] && echo "vkw trainset is missing, wget to this directory" && exit 0
fi
echo "$0: vkw data preparation succeeded"
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
# Tencent (Yougen Yuan)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import copy
import logging
import os
import torch
import torch.distributed as dist
import yaml
from torch.utils.data import DataLoader
from wenet.dataset.dataset import Dataset
from wenet.transformer.asr_model import init_asr_model
from wenet.utils.checkpoint import load_checkpoint
from wenet.utils.common import get_subsample
from wenet.utils.common import remove_duplicates_and_blank
from wenet.utils.file_utils import read_symbol_table
from wenet.utils.mask import make_pad_mask
def map_words2char(word_list_file):
word_unit_dict = {}
word_id_dict = {}
for line in open(word_list_file, mode="r", encoding="utf8"):
ids, keyword = line.split("\n")[0].split()
keyword_char = []
for i in keyword:
keyword_char.append(i)
word_unit_dict[keyword] = keyword_char
word_id_dict[keyword] = ids
return word_id_dict, word_unit_dict
def get_frames_timestamp(alignment):
# convert the frame-level alignment into per-token chunks (a Praat-style
# segmentation; Praat is a "doing phonetics by computer" tool) to help analyze the alignment
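# For example, an alignment like [0, 0, 5, 5, 0, 7] is split into
# [[0, 0, 5, 5], [0, 7]]: each chunk holds one non-blank token together with
# the blank (0) frames that precede it, and trailing blanks are appended to
# the last chunk.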
timestamp = []
# get frames level duration for each token
start = 0
end = 0
while end < len(alignment):
while end < len(alignment) and alignment[end] == 0:
end += 1
if end == len(alignment) and start < end:
if start == 0:
timestamp.append(alignment[start:])
else:
timestamp[-1] += alignment[start:]
break
end += 1
while end < len(alignment) and alignment[end - 1] == alignment[end]:
end += 1
timestamp.append(alignment[start:end])
start = end
return timestamp
def get_labformat_frames(timestamp, subsample, char_dict):
begin = 0
duration = 0
word_seq = []
word_time = []
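# word_time holds [begin, end] pairs in input feature frames (encoder frames
# multiplied by the subsampling factor); with the 10 ms frame_shift used in
# this recipe's configs, frame 100 corresponds to roughly 1 second.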
for idx, t in enumerate(timestamp):
duration = len(t) * subsample
if idx < len(timestamp) - 1:
word_seq.append(char_dict[t[-1]])
word_time.append([begin, begin + duration])
else:
non_blank = 0
token = 0
for i in t:
if i != 0:
token = i
break
word_seq.append(char_dict[token])
word_time.append([begin, begin + duration])
begin = begin + duration
return word_seq, word_time
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='compute keyword spotting results')
parser.add_argument('--config', required=True, help='config file')
parser.add_argument('--data_type',
default='raw',
choices=['raw', 'shard'],
help='train and cv data type')
parser.add_argument('--input_data', required=True, help='input data file')
parser.add_argument('--gpu',
type=int,
default=-1,
help='gpu id for this local rank, -1 for cpu')
parser.add_argument('--checkpoint', required=True, help='checkpoint model')
parser.add_argument('--ddp.rank',
dest='rank',
default=0,
type=int,
help='global rank for distributed training')
parser.add_argument('--ddp.world_size',
dest='world_size',
default=-1,
type=int,
help='''number of total processes/gpus for
distributed training''')
parser.add_argument('--ddp.dist_backend',
dest='dist_backend',
default='nccl',
choices=['nccl', 'gloo'],
help='distributed backend')
parser.add_argument('--ddp.init_method',
dest='init_method',
default=None,
help='ddp init method')
parser.add_argument('--num_workers',
default=0,
type=int,
help='num of subprocess workers for reading')
parser.add_argument('--pin_memory',
action='store_true',
default=False,
help='Use pinned memory buffers for reading')
parser.add_argument('--prefetch',
default=100,
type=int,
help='prefetch number')
parser.add_argument('--symbol_table',
required=True,
help='model unit symbol table for training')
parser.add_argument('--keyword_unit_dict',
required=True,
help='keyword id')
parser.add_argument('--keyword_results',
required=True,
help='keyword results')
parser.add_argument('--ctc_results', required=True, help='ctc results')
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
# Set random seed
torch.manual_seed(777)
with open(args.config, 'r') as fin:
configs = yaml.load(fin, Loader=yaml.FullLoader)
distributed = args.world_size > 1
if distributed:
logging.info('training on multiple gpus, this gpu {}'.format(args.gpu))
dist.init_process_group(args.dist_backend,
init_method=args.init_method,
world_size=args.world_size,
rank=args.rank)
symbol_table = read_symbol_table(args.symbol_table)
# Load dict
char_dict = {}
with open(args.symbol_table, mode='r') as fin:
for line in fin:
arr = line.strip().split()
assert len(arr) == 2
char_dict[int(arr[1])] = arr[0]
eos = len(char_dict) - 1
train_conf = configs['dataset_conf']
cv_conf = copy.deepcopy(train_conf)
cv_conf['speed_perturb'] = False
cv_conf['spec_aug'] = False
cv_dataset = Dataset(args.data_type,
args.input_data,
symbol_table,
cv_conf,
None,
partition=False)
cv_data_loader = DataLoader(cv_dataset,
batch_size=None,
pin_memory=args.pin_memory,
num_workers=args.num_workers,
prefetch_factor=args.prefetch)
print("Reading: ", args.keyword_unit_dict)
word_id_dict, word_unit_dict = map_words2char(args.keyword_unit_dict)
word_unit_list = list(word_unit_dict.keys())
print("word_unit_list has the size of %d" % (len(word_unit_list)))
# Init asr model from configs
model = init_asr_model(configs)
load_checkpoint(model, args.checkpoint)
use_cuda = args.gpu >= 0 and torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
model = model.to(device)
model.eval()
f_keyword_results = open(args.keyword_results, 'w', encoding='utf-8')
f_ctc_results = open(args.ctc_results, 'w', encoding='utf-8')
with torch.no_grad():
for batch_idx, batch in enumerate(cv_data_loader):
key, feat, target, feats_length, target_length = batch
feat = feat.to(device)
target = target.to(device)
feats_length = feats_length.to(device)
target_length = target_length.to(device)
# Let's assume B = batch_size and N = beam_size
# 1. Encoder
encoder_out, encoder_mask = model._forward_encoder(
feat, feats_length) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1)
batch_size = encoder_out.size(0)
ctc_probs = model.ctc.log_softmax(
encoder_out) # (B, maxlen, vocab_size)
encoder_out_lens = encoder_mask.squeeze(1).sum(1)
topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1)
topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen)
mask = make_pad_mask(encoder_out_lens) # (B, maxlen)
topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen)
alignment = [hyp.tolist() for hyp in topk_index]
hyps = [remove_duplicates_and_blank(hyp) for hyp in alignment]
for index, i in enumerate(key):
content = []
if len(hyps[index]) > 0:
for w in hyps[index]:
if w == eos:
break
content.append(char_dict[w])
f_ctc_results.write('{} {}\n'.format(i, " ".join(content)))
f_ctc_results.flush()
for index, i in enumerate(key):
timestamp = get_frames_timestamp(alignment[index])
subsample = get_subsample(configs)
word_seq, word_time = get_labformat_frames(
timestamp, subsample, char_dict)
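# Slide a window over the decoded character sequence: wherever a keyword's
# character sequence matches exactly, emit one line
# "<keyword_id> <utt_id> <start_frame> <end_frame> <score>" (score is fixed at 0.0).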
for index_j in range(len(word_seq)):
for keyword in word_unit_list:
keyword_len = len(word_unit_dict[keyword])
if index_j + keyword_len > len(word_seq):
continue
if (word_seq[index_j:index_j +
keyword_len] == word_unit_dict[keyword]):
f_keyword_results.write("{} {} {} {} {}\n".format(
word_id_dict[keyword], i,
word_time[index_j][0],
word_time[index_j + keyword_len - 1][1], 0.0))
f_keyword_results.flush()
f_keyword_results.close()
f_ctc_results.close()
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2021 Tencent Inc. (Author: Yougen Yuan).
# Apache 2.0
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU; otherwise it is multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
stage=-1
stop_stage=0
# The num of nodes
num_nodes=1
# The rank of current node
node_rank=0
# data
data=data
dict=data/dict/lang_char.txt
data_type=raw # raw or shard
train_set=train
dev_set=combine_dev
# Optional train_config
name=vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char
train_config=conf/train_${name}.yaml
cmvn=true
dir=exp/train_${name}_new
checkpoint= #$dir/0.pt
# using average_checkpoint will get a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=10
. tools/parse_options.sh || exit 1;
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# Data preparation
local/vkw_data_prep.sh
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
x=dev_5h
for z in lgv liv stv; do
[ ! -f data/vkw/label/lab_${z}/${x}/wav_ori.scp ] && \
mv data/vkw/label/lab_${z}/${x}/wav.scp \
data/vkw/label/lab_${z}/${x}/wav_ori.scp && \
cut -d " " -f 1,4 data/vkw/label/lab_${z}/${x}/wav_ori.scp \
> data/vkw/label/lab_${z}/${x}/wav.scp
done
y=`echo $x | cut -d "_" -f 1`
mkdir -p combine_${y}
for f in text wav.scp segments; do
for z in lgv liv stv; do
cat data/vkw/label/lab_${z}/${x}/$f
done > combine_${y}/$f
done
# remove the spaces between text labels for the Mandarin dataset
# download and transfer to wav.scp
for x in ${dev_set} ${train_set}; do
cp data/${x}/text data/${x}/text.org
paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " \
data/${x}/text.org | tr -d " ") > data/${x}/text
rm data/${x}/text.org
done
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: generate segmented wav.scp and compute cmvn"
## For wav feature, just copy the data. Fbank extraction is done in training
for x in ${dev_set} ${train_set}; do
[ ! -f $data/$x/segmented_wav.scp ] && \
python tools/segment.py --segments $data/$x/segments \
--input $data/$x/wav.scp \
--output $data/$x/segmented_wav.scp
done
### generate global_cmvn using training set
tools/compute_cmvn_stats.py --num_workers 12 --train_config $train_config \
--in_scp $data/${train_set}/segmented_wav.scp \
--out_cmvn $data/$train_set/global_cmvn
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Make train dict
echo "Make a dictionary"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
tools/text2token.py -s 1 -n 1 $data/${train_set}/text | cut -f 2- -d" " | \
tr " " "\n" | sort | uniq | grep -a -v -e '^\s*$' | grep -P '[\p{Han}]'\
| awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Prepare data, prepare required format"
for x in ${dev_set} ${train_set}; do
tools/make_raw_list.py --segments $data/$x/segments \
$data/$x/wav.scp $data/$x/text $data/$x/data.list
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p $dir
INIT_FILE=$dir/ddp_init
# You had better rm it manually before you start run.sh on first node.
# rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
# The number of gpus running on each node/machine
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="gloo"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp ${data}/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process used for knowing whether it is
# the master of a worker.
rank=$i ###`expr $node_rank \* $num_gpus + $i`
echo "start training"
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type $data_type \
--symbol_table $dict \
--train_data $data/$train_set/data.list \
--cv_data $data/${dev_set}/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
--num_workers 4 \
$cmvn_opts \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
[ ! -f $decode_checkpoint ] && \
python3 wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Test model, please specify the model you want to use by --checkpoint
sets=${dev_set}
keywords_list=$data/vkw/keyword/kwlist
input_data=$data/${sets}/data.list
checkpoint=$dir/avg_${average_num}.pt
keyword_results=$dir/keyword_results_${sets}
ctc_results=$dir/ctc_results_${sets}
python3 local/vkw_kws_results.py --gpu 0 \
--config $dir/train.yaml \
--data_type $data_type \
--symbol_table $dict \
--num_workers 4 \
--prefetch 32 \
--input_data $input_data \
--checkpoint $checkpoint \
--keyword_unit_dict $keywords_list \
--keyword_results $keyword_results \
--ctc_results $ctc_results
[ ! -f scripts/bin/results_to_score.sh ] && \
ln -sf data/vkw/scripts scripts && chmod -R 755 scripts
### attention: install the F4DE tool before testing
for y in "stv" "lgv" "liv"; do
mkdir -p $dir/dev_${y}
#[ ! -f data/vkw/score/dev_${y}/utter_map ] && \
if [ $y == "lgv" ]; then
grep "TV1" $keyword_results > $dir/dev_${y}/kws_results
elif [ $y == "liv" ]; then
grep "sph_live" $keyword_results > $dir/dev_${y}/kws_results
elif [ $y == "stv" ]; then
grep "sph_video" $keyword_results > $dir/dev_${y}/kws_results
else
"invalid $y"
fi
./data/vkw/scripts/bin/results_to_score.sh \
data/vkw/score/dev_${y}/ecf \
data/vkw/label/lab_${y}/dev_5h/segments \
data/vkw/score/dev_${y}/utter_map \
$dir/dev_${y}/kws_results \
data/vkw/keyword/kwlist.xml \
data/vkw/score/dev_${y}/rttm
./data/vkw/scripts/bin/F1.sh \
$dir/dev_${y}/kws_outputs/f4de_scores_unnormalized/alignment.csv
done
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
echo "adding 5h finetune data for each scenario to obtain better results"
local/run_finetune_5h.sh
fi
# Performance Record
## Conformer
* Feature info: using fbank feature, with dither 1.0, with cmvn
* Training info: lr 0.001, batch size 32, 24 gpus on V100, acc_grad 16, 26 epochs
* Decoding info: ctc_weight 0.5, average_num 10
| decoding_method | Dev | Test\_Net | Test\_Meeting |
|:-------------------:|:----:|:---------:|:-------------:|
| ctc_greedy_search | 8.88 | 10.29 | 15.96 |
| attention | 9.38 | 10.12 | 17.28 |
| attention_rescoring | 8.69 | 9.70 | 15.59 |
## Conformer bidecoder
* Feature info: using fbank feature, with dither 1.0, with cmvn
* Training info: lr 0.001, batch size 32, 24 gpus on V100, acc_grad 16, 26 epochs
* Decoding info: ctc_weight 0.5, average_num 10
| decoding_method | Dev | Test\_Net | Test\_Meeting |
|:-------------------:|:----:|:---------:|:-------------:|
| ctc_greedy_search | 8.98 | 9.55 | 16.48 |
| attention | 9.42 | 10.57 | 18.05 |
| attention_rescoring | 8.85 | 9.25 | 16.18 |
## U2++ conformer
* Feature info: using fbank feature, with dither 1.0, with cmvn
* Training info: lr 0.001, batch size 48, 8 gpus on A100, acc_grad 16, 50 epochs
* Decoding info: ctc_weight 0.5, reverse_weight 0.3, average_num 10
| Decoding mode - Chunk size | Dev | Test\_Net | Test\_Meeting |
|:-----------------------------:|:----:|:---------:|:-------------:|
| ctc greedy search - full | 8.85 | 9.78 | 17.77 |
| ctc greedy search - 16 | 9.32 | 11.02 | 18.79 |
| ctc prefix beam search - full | 8.80 | 9.73 | 17.57 |
| ctc prefix beam search - 16 | 9.25 | 10.96 | 18.62 |
| attention rescoring - full | 8.60 | 9.26 | 17.34 |
| attention rescoring - 16 | 8.87 | 10.22 | 18.11 |
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
cnn_module_norm: 'layer_norm'
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
filter_conf:
max_length: 1200
min_length: 10
token_max_length: 100
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: false
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 30
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 1000 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 32
grad_clip: 5
accum_grad: 16
max_epoch: 26
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 5000
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
cnn_module_norm: 'layer_norm'
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: bitransformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 3
r_num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
reverse_weight: 0.3
dataset_conf:
filter_conf:
max_length: 1200
min_length: 10
token_max_length: 100
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: false
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 30
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 1000 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 32
grad_clip: 5
accum_grad: 16
max_epoch: 26
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 5000
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
import argparse
import json
def get_args():
parser = argparse.ArgumentParser(description="""
This script is used to process the raw json dataset of WenetSpeech,
where each long wav is split into segments and
data in WeNet format is generated.
""")
parser.add_argument('input_json', help="""Input json file of WenetSpeech""")
parser.add_argument('output_dir', help="""Output dir for prepared data""")
args = parser.parse_args()
return args
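# meta_analysis() below walks WenetSpeech.json and writes Kaldi-style files into
# output_dir: wav.scp (aid<TAB>path), reco2dur (aid<TAB>duration), text (sid<TAB>text),
# segments (sid<TAB>aid<TAB>begin<TAB>end), utt2dur (sid<TAB>duration) and
# utt2subsets (sid<TAB>subset names).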
def meta_analysis(input_json, output_dir):
input_dir = os.path.dirname(input_json)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
try:
with open(input_json, 'r') as injson:
json_data = json.load(injson)
except Exception:
sys.exit(f'Failed to load input json file: {input_json}')
else:
if json_data['audios'] is not None:
with open(f'{output_dir}/text', 'w') as utt2text, \
open(f'{output_dir}/segments', 'w') as segments, \
open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
open(f'{output_dir}/wav.scp', 'w') as wavscp, \
open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
open(f'{output_dir}/reco2dur', 'w') as reco2dur:
for long_audio in json_data['audios']:
try:
long_audio_path = os.path.realpath(
os.path.join(input_dir, long_audio['path']))
aid = long_audio['aid']
segments_lists = long_audio['segments']
duration = long_audio['duration']
assert (os.path.exists(long_audio_path))
except AssertionError:
print(f'''Warning: {aid} something is wrong,
maybe AssertionError, skipped''')
continue
except Exception:
print(f'''Warning: {aid} something is wrong, maybe the
error path: {long_audio_path}, skipped''')
continue
else:
wavscp.write(f'{aid}\t{long_audio_path}\n')
reco2dur.write(f'{aid}\t{duration}\n')
for segment_file in segments_lists:
try:
sid = segment_file['sid']
start_time = segment_file['begin_time']
end_time = segment_file['end_time']
dur = end_time - start_time
text = segment_file['text']
segment_subsets = segment_file["subsets"]
except Exception:
print(f'''Warning: {segment_file} something
is wrong, skipped''')
continue
else:
utt2text.write(f'{sid}\t{text}\n')
segments.write(
f'{sid}\t{aid}\t{start_time}\t{end_time}\n'
)
utt2dur.write(f'{sid}\t{dur}\n')
segment_sub_names = " ".join(segment_subsets)
utt2subsets.write(
f'{sid}\t{segment_sub_names}\n')
def main():
args = get_args()
meta_analysis(args.input_json, args.output_dir)
if __name__ == '__main__':
main()
# Copyright 2021 NPU, ASLP Group (Author: Qijie Shao)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# process_opus.py: segmentation and downsampling of opus audio
# usage: python3 process_opus.py wav.scp segments output_wav.scp
from pydub import AudioSegment
import sys
import os
def read_file(wav_scp, segments):
wav_scp_dict = {}
with open(wav_scp, 'r', encoding='UTF-8') as fin:
for line_str in fin:
wav_id, path = line_str.strip().split()
wav_scp_dict[wav_id] = path
utt_list = []
seg_path_list = []
start_time_list = []
end_time_list = []
with open(segments, 'r', encoding='UTF-8') as fin:
for line_str in fin:
arr = line_str.strip().split()
assert len(arr) == 4
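# each segments line is: <utt_id> <wav_id> <start_seconds> <end_seconds>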
utt_list.append(arr[0])
seg_path_list.append(wav_scp_dict[arr[1]])
start_time_list.append(float(arr[2]))
end_time_list.append(float(arr[3]))
return utt_list, seg_path_list, start_time_list, end_time_list
# TODO(Qijie): Fix the process logic
def output(output_wav_scp, utt_list, seg_path_list, start_time_list,
end_time_list):
num_utts = len(utt_list)
step = int(num_utts * 0.01)
with open(output_wav_scp, 'w', encoding='UTF-8') as fout:
previous_wav_path = ""
for i in range(num_utts):
utt_id = utt_list[i]
current_wav_path = seg_path_list[i]
output_dir = (os.path.dirname(current_wav_path)) \
.replace("audio", 'audio_seg')
seg_wav_path = os.path.join(output_dir, utt_id + '.wav')
# if not os.path.exists(output_dir):
# os.makedirs(output_dir)
if current_wav_path != previous_wav_path:
source_wav = AudioSegment.from_file(current_wav_path)
previous_wav_path = current_wav_path
start = int(start_time_list[i] * 1000)
end = int(end_time_list[i] * 1000)
target_audio = source_wav[start:end].set_frame_rate(16000) \
.set_sample_width(2)
target_audio.export(seg_wav_path, format="wav")
fout.write("{} {}\n".format(utt_id, seg_wav_path))
if step > 0 and i % step == 0:
print("seg wav finished: {}%".format(int(i / step)))
def main():
wav_scp = sys.argv[1]
segments = sys.argv[2]
output_wav_scp = sys.argv[3]
utt_list, seg_path_list, start_time_list, end_time_list \
= read_file(wav_scp, segments)
output(output_wav_scp, utt_list, seg_path_list, start_time_list,
end_time_list)
if __name__ == '__main__':
main()
#!/usr/bin/env bash
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
# Seasalt AI, Inc (Author: Guoguo Chen)
# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# NPU, ASLP Group (Author: Qijie Shao)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
set -o pipefail
stage=1
prefix=
train_subset=L
. ./tools/parse_options.sh || exit 1;
filter_by_id () {
idlist=$1
input=$2
output=$3
field=1
if [ $# -eq 4 ]; then
field=$4
fi
cat $input | perl -se '
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
@A = split;
@A>=1 || die "Invalid id-list file line $_";
$seen{$A[0]} = 1;
}
while(<>) {
@A = split;
@A > 0 || die "Invalid file line $_";
@A >= $field || die "Invalid file line $_";
if ($seen{$A[$field-1]}) {
print $_;
}
}' -- -idlist="$idlist" -field="$field" > $output ||\
(echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
}
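# filter_by_id <idlist> <input> <output> [field]
# keeps the lines of <input> whose id in column [field] (default: column 1)
# appears in the first column of <idlist>; used below to subset text, segments,
# utt2dur and wav.scp.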
subset_data_dir () {
utt_list=$1
src_dir=$2
dest_dir=$3
mkdir -p $dest_dir || exit 1;
# wav.scp text segments utt2dur
filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\
(echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1;
filter_by_id $utt_list $src_dir/text $dest_dir/text ||\
(echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1;
filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\
(echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1;
awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco
filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\
(echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1;
rm -f $dest_dir/reco
}
if [ $# -ne 2 ]; then
echo "Usage: $0 [options] <wenetspeech-dataset-dir> <data-dir>"
echo " e.g.: $0 --train-subset L /disk1/audio_data/wenetspeech/ data/"
echo ""
echo "This script takes the WenetSpeech source directory, and prepares the"
echo "WeNet format data directory."
echo " --prefix <prefix> # Prefix for output data directory."
echo " --stage <stage> # Processing stage."
echo " --train-subset <L|M|S|W> # Train subset to be created."
exit 1
fi
wenetspeech_dir=$1
data_dir=$2
declare -A subsets
subsets=(
[L]="train_l"
[M]="train_m"
[S]="train_s"
[W]="train_w"
[DEV]="dev"
[TEST_NET]="test_net"
[TEST_MEETING]="test_meeting")
prefix=${prefix:+${prefix}_}
corpus_dir=$data_dir/${prefix}corpus/
if [ $stage -le 1 ]; then
echo "$0: Extract meta into $corpus_dir"
# Sanity check.
[ ! -f $wenetspeech_dir/WenetSpeech.json ] &&\
echo "$0: Please download $wenetspeech_dir/WenetSpeech.json!" && exit 1;
[ ! -d $wenetspeech_dir/audio ] &&\
echo "$0: Please download $wenetspeech_dir/audio!" && exit 1;
[ ! -d $corpus_dir ] && mkdir -p $corpus_dir
# Files to be created:
# wav.scp text segments utt2dur
python3 local/extract_meta.py \
$wenetspeech_dir/WenetSpeech.json $corpus_dir || exit 1;
fi
if [ $stage -le 2 ]; then
echo "$0: Split data to train, dev, test_net, and test_meeting"
[ ! -f $corpus_dir/utt2subsets ] &&\
echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;
for label in $train_subset DEV TEST_NET TEST_MEETING; do
if [ ! ${subsets[$label]+set} ]; then
echo "$0: Subset $label is not defined in WenetSpeech.json." && exit 1;
fi
subset=${subsets[$label]}
[ ! -d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset
cat $corpus_dir/utt2subsets | \
awk -v s=$label '{for (i=2;i<=NF;i++) if($i==s) print $0;}' \
> $corpus_dir/${prefix}${subset}_utt_list|| exit 1;
subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \
$corpus_dir $data_dir/${prefix}$subset || exit 1;
done
fi
echo "$0: Done"
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# NPU, ASLP Group (Author: Qijie Shao)
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU; otherwise it is multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
stage=0
stop_stage=5
# The num of nodes
num_nodes=1
# The rank of current node
node_rank=0
# Use your own data path. You need to download the WenetSpeech dataset by yourself.
wenetspeech_data_dir=/ssd/nfs07/binbinzhang/wenetspeech
# Make sure you have 1.2T for ${shards_dir}
shards_dir=/ssd/nfs06/unified_data/wenetspeech_shards
# WenetSpeech training set
set=L
train_set=train_`echo $set | tr 'A-Z' 'a-z'`
dev_set=dev
test_sets="test_net test_meeting"
train_config=conf/train_conformer.yaml
checkpoint=
cmvn=true
cmvn_sampling_divisor=20 # 20 means 5% of the training data to estimate cmvn
dir=exp/conformer
decode_checkpoint=
average_checkpoint=true
average_num=10
decode_modes="attention_rescoring ctc_greedy_search"
. tools/parse_options.sh || exit 1;
set -u
set -o pipefail
# Data download
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data."
exit 0;
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Data preparation"
local/wenetspeech_data_prep.sh \
--train-subset $set \
$wenetspeech_data_dir \
data || exit 1;
fi
dict=data/dict/lang_char.txt
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Make a dictionary"
echo "dictionary: ${dict}"
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
echo "▁ 2" >> ${dict} # ▁ is for space
tools/text2token.py -s 1 -n 1 --space "▁" data/${train_set}/text \
| cut -f 2- -d" " | tr " " "\n" \
| sort | uniq | grep -a -v -e '^\s*$' \
| grep -v "▁" \
| awk '{print $0 " " NR+2}' >> ${dict} \
|| exit 1;
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Compute cmvn"
# Here we use all the training data, you can sample some data to save time
# BUG!!! We should use the segmented data for CMVN
if $cmvn; then
full_size=`cat data/${train_set}/wav.scp | wc -l`
sampling_size=$((full_size / cmvn_sampling_divisor))
shuf -n $sampling_size data/$train_set/wav.scp \
> data/$train_set/wav.scp.sampled
python3 tools/compute_cmvn_stats.py \
--num_workers 16 \
--train_config $train_config \
--in_scp data/$train_set/wav.scp.sampled \
--out_cmvn data/$train_set/global_cmvn \
|| exit 1;
fi
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Making shards, please wait..."
RED='\033[0;31m'
NOCOLOR='\033[0m'
echo -e "It requires ${RED}1.2T ${NOCOLOR}space for $shards_dir, please make sure you have enough space"
echo -e "It takes about ${RED}12 ${NOCOLOR}hours with 32 threads"
for x in $dev_set $test_sets ${train_set}; do
dst=$shards_dir/$x
mkdir -p $dst
tools/make_shard_list.py --resample 16000 --num_utts_per_shard 1000 \
--num_threads 32 --segments data/$x/segments \
data/$x/wav.scp data/$x/text \
$(realpath $dst) data/$x/data.list
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "Start training"
mkdir -p $dir
# INIT_FILE is for DDP synchronization
INIT_FILE=$dir/ddp_init
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="nccl"
world_size=`expr $num_gpus \* $num_nodes`
echo "total gpus is: $world_size"
cmvn_opts=
$cmvn && cp data/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < $num_gpus; ++i)); do
{
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
# Rank of each gpu/process used for knowing whether it is
# the master of a worker.
rank=`expr $node_rank \* $num_gpus + $i`
python wenet/bin/train.py --gpu $gpu_id \
--config $train_config \
--data_type "shard" \
--symbol_table $dict \
--train_data data/$train_set/data.list \
--cv_data data/$dev_set/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.init_method $init_method \
--ddp.world_size $world_size \
--ddp.rank $rank \
--ddp.dist_backend $dist_backend \
$cmvn_opts \
--num_workers 8 \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
echo "Test model"
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
ctc_weight=0.5
reverse_weight=0.0
for testset in ${test_sets} ${dev_set}; do
{
for mode in ${decode_modes}; do
{
base=$(basename $decode_checkpoint)
result_dir=$dir/${testset}_${mode}_${base}
mkdir -p $result_dir
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type "shard" \
--test_data data/$testset/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--reverse_weight $reverse_weight \
--result_file $result_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
python tools/compute-wer.py --char=1 --v=1 \
data/$testset/text $result_dir/text > $result_dir/wer
}
done
wait
}
done
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
echo "Export the best model you want"
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip
fi